diff --git a/llvm/include/llvm/CodeGen/CalcSpillWeights.h b/llvm/include/llvm/CodeGen/CalcSpillWeights.h index 5a86dd9650fbd..11d1c16561507 100644 --- a/llvm/include/llvm/CodeGen/CalcSpillWeights.h +++ b/llvm/include/llvm/CodeGen/CalcSpillWeights.h @@ -81,6 +81,14 @@ class VirtRegMap; static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, const VirtRegMap &VRM, + const MachineRegisterInfo &MRI, + const TargetInstrInfo &TII); + + /// \returns true if all registers used by \p MI are also available with the + /// same value at \p UseIdx. + static bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, const TargetInstrInfo &TII); protected: diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h index db1785de255f0..6473138a801f7 100644 --- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h +++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h @@ -189,11 +189,6 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate { explicit Remat(const VNInfo *ParentVNI) : ParentVNI(ParentVNI) {} }; - /// allUsesAvailableAt - Return true if all registers used by OrigMI at - /// OrigIdx are also available with the same value at UseIdx. - bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx, - SlotIndex UseIdx) const; - /// canRematerializeAt - Determine if ParentVNI can be rematerialized at /// UseIdx. It is assumed that parent_.getVNINfoAt(UseIdx) == ParentVNI. bool canRematerializeAt(Remat &RM, VNInfo *OrigVNI, SlotIndex UseIdx); diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index f5a6eec57ee7e..a77da01761579 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -81,12 +81,15 @@ Register VirtRegAuxInfo::copyHint(const MachineInstr *MI, Register Reg, bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, const VirtRegMap &VRM, + const MachineRegisterInfo &MRI, const TargetInstrInfo &TII) { Register Reg = LI.reg(); Register Original = VRM.getOriginal(Reg); + SmallDenseMap VNIDefs; for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { const VNInfo *VNI = *I; + const VNInfo *OrigVNI = VNI; if (VNI->isUnused()) continue; if (VNI->isPHIDef()) @@ -124,6 +127,75 @@ bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI, if (!TII.isReMaterializable(*MI)) return false; + + VNIDefs[OrigVNI->id] = MI; + } + + // If MI has register uses, it will only be rematerializable if its uses are + // also live at the indices it will be rematerialized at. + for (MachineOperand &MO : MRI.reg_nodbg_operands(LI.reg())) { + if (!MO.readsReg()) + continue; + SlotIndex UseIdx = LIS.getInstructionIndex(*MO.getParent()); + MachineInstr *Def = VNIDefs[LI.getVNInfoAt(UseIdx)->id]; + assert(Def && "Use with no def"); + if (!allUsesAvailableAt(Def, UseIdx, LIS, MRI, TII)) + return false; + } + + return true; +} + +bool VirtRegAuxInfo::allUsesAvailableAt(const MachineInstr *MI, + SlotIndex UseIdx, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, + const TargetInstrInfo &TII) { + SlotIndex OrigIdx = LIS.getInstructionIndex(*MI).getRegSlot(true); + UseIdx = std::max(UseIdx, UseIdx.getRegSlot(true)); + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) + continue; + + // We can't remat physreg uses, unless it is a constant or target wants + // to ignore this use. + if (MO.getReg().isPhysical()) { + if (MRI.isConstantPhysReg(MO.getReg()) || TII.isIgnorableUse(MO)) + continue; + return false; + } + + const LiveInterval &li = LIS.getInterval(MO.getReg()); + const VNInfo *OVNI = li.getVNInfoAt(OrigIdx); + if (!OVNI) + continue; + + // Don't allow rematerialization immediately after the original def. + // It would be incorrect if OrigMI redefines the register. + // See PR14098. + if (SlotIndex::isSameInstr(OrigIdx, UseIdx)) + return false; + + if (OVNI != li.getVNInfoAt(UseIdx)) + return false; + + // Check that subrange is live at UseIdx. + if (li.hasSubRanges()) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + unsigned SubReg = MO.getSubReg(); + LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) + : MRI.getMaxLaneMaskForVReg(MO.getReg()); + for (const LiveInterval::SubRange &SR : li.subranges()) { + if ((SR.LaneMask & LM).none()) + continue; + if (!SR.liveAt(UseIdx)) + return false; + // Early exit if all used lanes are checked. No need to continue. + LM &= ~SR.LaneMask; + if (LM.none()) + break; + } + } } return true; } @@ -339,7 +411,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, // it is a preferred candidate for spilling. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. - if (isRematerializable(LI, LIS, VRM, *MF.getSubtarget().getInstrInfo())) + if (isRematerializable(LI, LIS, VRM, MRI, *MF.getSubtarget().getInstrInfo())) TotalWeight *= 0.5F; // Finally, we scale the weight by the scale factor of register class. diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 2f65be51bb726..59bc82dc267b5 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -92,60 +92,6 @@ bool LiveRangeEdit::anyRematerializable() { return !Remattable.empty(); } -/// allUsesAvailableAt - Return true if all registers used by OrigMI at -/// OrigIdx are also available with the same value at UseIdx. -bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, - SlotIndex OrigIdx, - SlotIndex UseIdx) const { - OrigIdx = OrigIdx.getRegSlot(true); - UseIdx = std::max(UseIdx, UseIdx.getRegSlot(true)); - for (const MachineOperand &MO : OrigMI->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) - continue; - - // We can't remat physreg uses, unless it is a constant or target wants - // to ignore this use. - if (MO.getReg().isPhysical()) { - if (MRI.isConstantPhysReg(MO.getReg()) || TII.isIgnorableUse(MO)) - continue; - return false; - } - - LiveInterval &li = LIS.getInterval(MO.getReg()); - const VNInfo *OVNI = li.getVNInfoAt(OrigIdx); - if (!OVNI) - continue; - - // Don't allow rematerialization immediately after the original def. - // It would be incorrect if OrigMI redefines the register. - // See PR14098. - if (SlotIndex::isSameInstr(OrigIdx, UseIdx)) - return false; - - if (OVNI != li.getVNInfoAt(UseIdx)) - return false; - - // Check that subrange is live at UseIdx. - if (li.hasSubRanges()) { - const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); - unsigned SubReg = MO.getSubReg(); - LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) - : MRI.getMaxLaneMaskForVReg(MO.getReg()); - for (LiveInterval::SubRange &SR : li.subranges()) { - if ((SR.LaneMask & LM).none()) - continue; - if (!SR.liveAt(UseIdx)) - return false; - // Early exit if all used lanes are checked. No need to continue. - LM &= ~SR.LaneMask; - if (LM.none()) - break; - } - } - } - return true; -} - bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI, SlotIndex UseIdx) { assert(ScannedRemattable && "Call anyRematerializable first"); @@ -155,12 +101,10 @@ bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI, return false; // No defining instruction provided. - SlotIndex DefIdx; assert(RM.OrigMI && "No defining instruction for remattable value"); - DefIdx = LIS.getInstructionIndex(*RM.OrigMI); // Verify that all used registers are available with the same values. - if (!allUsesAvailableAt(RM.OrigMI, DefIdx, UseIdx)) + if (!VirtRegAuxInfo::allUsesAvailableAt(RM.OrigMI, UseIdx, LIS, MRI, TII)) return false; return true; @@ -221,8 +165,8 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, // Since we're moving the DefMI load, make sure we're not extending any live // ranges. - if (!allUsesAvailableAt(DefMI, LIS.getInstructionIndex(*DefMI), - LIS.getInstructionIndex(*UseMI))) + if (!VirtRegAuxInfo::allUsesAvailableAt( + DefMI, LIS.getInstructionIndex(*UseMI), LIS, MRI, TII)) return false; // We also need to make sure it is safe to move the load. diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 1cde094d78e23..b2731b691d54c 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -927,7 +927,7 @@ MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const { Ret.HintWeights += Freq; } Ret.IsRemat = VirtRegAuxInfo::isRematerializable( - LI, *LIS, *VRM, *MF.getSubtarget().getInstrInfo()); + LI, *LIS, *VRM, *MRI, *MF.getSubtarget().getInstrInfo()); return Ret; } diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index d18f702bf526e..e3b6bfc03f467 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" @@ -1393,10 +1394,7 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, } } - SmallVector NewRegs; - LiveRangeEdit Edit(&SrcInt, NewRegs, *MF, *LIS, nullptr, this); - SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI); - if (!Edit.allUsesAvailableAt(DefMI, DefIdx, CopyIdx)) + if (!VirtRegAuxInfo::allUsesAvailableAt(DefMI, CopyIdx, *LIS, *MRI, *TII)) return false; DebugLoc DL = CopyMI->getDebugLoc(); @@ -1405,6 +1403,8 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, std::next(MachineBasicBlock::iterator(CopyMI)); LiveRangeEdit::Remat RM(ValNo); RM.OrigMI = DefMI; + SmallVector NewRegs; + LiveRangeEdit Edit(&SrcInt, NewRegs, *MF, *LIS, nullptr, this); Edit.rematerializeAt(*MBB, MII, DstReg, RM, *TRI, false, SrcIdx, CopyMI); MachineInstr &NewMI = *std::prev(MII); NewMI.setDebugLoc(DL); diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index be08c4e33f072..8424917b2d604 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -17296,8 +17296,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 @@ -17305,133 +17312,93 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 @@ -17440,31 +17407,31 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 @@ -17476,140 +17443,206 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -17618,306 +17651,277 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 -; SI-NEXT: v_mov_b32_e32 v43, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v50, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_mov_b32_e32 v55, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_mov_b32_e32 v44, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_mov_b32_e32 v54, v23 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 -; SI-NEXT: v_mov_b32_e32 v59, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v45, v24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_mov_b32_e32 v36, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -17944,108 +17948,112 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v35, v57 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB15_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s4, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18054,17 +18062,17 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18074,15 +18082,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18092,15 +18100,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18110,15 +18118,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18128,15 +18136,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18146,15 +18154,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18164,15 +18172,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18181,16 +18189,17 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18200,15 +18209,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18218,84 +18227,79 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -18304,15 +18308,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -18321,15 +18325,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18339,9 +18343,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -18357,106 +18361,110 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -18464,14 +18472,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -18479,14 +18487,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18516,7 +18524,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v32i32_scalar: @@ -18538,113 +18546,115 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -18653,29 +18663,28 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -18685,130 +18694,141 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -18817,208 +18837,197 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v37, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v62, v0 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v60, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v40, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -19049,85 +19058,95 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v49, v51 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB15_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v51, v41 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v54, v60 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, s4, v0 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -19136,26 +19155,25 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -19163,8 +19181,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -19176,9 +19194,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -19190,14 +19208,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -19205,280 +19223,280 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -19523,504 +19541,524 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(42) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v47, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v60 -; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v34, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v51, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v53, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v45, v62 +; GFX9-NEXT: v_mov_b32_e32 v46, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -20051,32 +20089,48 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_mov_b32_e32 v63, v57 -; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v57, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB15_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -20123,348 +20177,352 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -27132,24 +27190,23 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -27159,165 +27216,177 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 @@ -27325,16 +27394,6 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 @@ -27342,212 +27401,238 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 ; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 +; SI-NEXT: v_mov_b32_e32 v41, v61 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 +; SI-NEXT: v_mov_b32_e32 v55, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: s_branch .LBB19_3 +; SI-NEXT: .LBB19_2: +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v43, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 -; SI-NEXT: s_cbranch_execnz .LBB19_3 -; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB19_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB19_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -27561,105 +27646,107 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: .LBB19_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -27678,41 +27765,6 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; VI: ; %bb.0: @@ -29901,15 +29953,13 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -29919,13 +29969,15 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -30008,87 +30060,82 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -30104,7 +30151,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 @@ -30125,7 +30172,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 @@ -30139,6 +30186,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 @@ -30147,17 +30196,19 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -30179,133 +30230,122 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v62 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -30321,14 +30361,16 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 @@ -30338,37 +30380,37 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v56, v28 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v46, v8 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_mov_b32_e32 v50, v28 +; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v56, v8 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -30379,41 +30421,45 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -30422,7 +30468,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -30431,7 +30477,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -30440,7 +30486,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -30449,7 +30495,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -30458,7 +30504,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -30467,7 +30513,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -30476,7 +30522,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -30485,7 +30531,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -30495,20 +30541,9 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30519,7 +30554,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30530,7 +30565,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30541,7 +30576,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30552,7 +30587,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30563,7 +30598,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30572,9 +30607,9 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30584,8 +30619,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30596,7 +30631,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30607,7 +30642,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30617,30 +30652,37 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -35860,179 +35902,162 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v56, v10 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB27_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_or_b32_e32 v10, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v11, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v29, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 @@ -36040,15 +36065,18 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_or_b32_e32 v8, v1, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v8, v1, v56 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -36056,14 +36084,40 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_cbranch_execnz .LBB27_3 -; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB27_3 +; SI-NEXT: .LBB27_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB27_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_cbranch_vccnz .LBB27_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -36104,143 +36158,143 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -36249,7 +36303,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: .LBB27_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -36268,35 +36322,6 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB27_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v39, v23 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB27_2 ; ; VI-LABEL: bitcast_v64i16_to_v32i32_scalar: ; VI: ; %bb.0: @@ -53997,8 +54022,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 @@ -54006,133 +54038,93 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 @@ -54141,31 +54133,31 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 @@ -54177,140 +54169,206 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB39_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -54319,306 +54377,277 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 -; SI-NEXT: v_mov_b32_e32 v43, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v50, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_mov_b32_e32 v55, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_mov_b32_e32 v44, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_mov_b32_e32 v54, v23 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 -; SI-NEXT: v_mov_b32_e32 v59, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v45, v24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_mov_b32_e32 v36, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -54645,108 +54674,112 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v35, v57 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB39_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s4, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54755,17 +54788,17 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54775,15 +54808,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54793,15 +54826,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54811,15 +54844,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54829,15 +54862,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54847,15 +54880,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54865,15 +54898,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54882,16 +54915,17 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54901,15 +54935,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54919,84 +54953,79 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -55005,15 +55034,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -55022,15 +55051,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -55040,9 +55069,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -55058,106 +55087,110 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -55165,14 +55198,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -55180,14 +55213,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -55217,7 +55250,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v32f32_scalar: @@ -55239,113 +55272,115 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -55354,29 +55389,28 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -55386,130 +55420,141 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB39_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -55518,208 +55563,197 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v37, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v62, v0 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v60, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v40, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -55750,85 +55784,95 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v49, v51 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB39_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v51, v41 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v54, v60 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB39_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, s4, v0 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -55837,26 +55881,25 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -55864,8 +55907,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -55877,9 +55920,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -55891,14 +55934,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -55906,280 +55949,280 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -56224,504 +56267,524 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(42) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB39_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v47, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v60 -; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v34, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v51, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v53, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v45, v62 +; GFX9-NEXT: v_mov_b32_e32 v46, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -56752,32 +56815,48 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_mov_b32_e32 v63, v57 -; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v57, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB39_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB39_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -56824,348 +56903,352 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -63879,24 +63962,23 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -63906,165 +63988,177 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB43_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 @@ -64072,16 +64166,6 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 @@ -64089,212 +64173,238 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 ; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 +; SI-NEXT: v_mov_b32_e32 v41, v61 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 +; SI-NEXT: v_mov_b32_e32 v55, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: s_branch .LBB43_3 +; SI-NEXT: .LBB43_2: +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v43, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 -; SI-NEXT: s_cbranch_execnz .LBB43_3 -; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB43_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB43_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -64308,105 +64418,107 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: .LBB43_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -64425,41 +64537,6 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; VI: ; %bb.0: @@ -66648,15 +66725,13 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -66666,13 +66741,15 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -66755,87 +66832,82 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -66851,7 +66923,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 @@ -66872,7 +66944,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 @@ -66886,6 +66958,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 @@ -66894,17 +66968,19 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -66926,133 +67002,122 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_f32_e32 v44, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v46, 1.0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_f32_e32 v42, 1.0, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -67071,8 +67136,13 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 @@ -67082,40 +67152,37 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v56, v28 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v46, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_mov_b32_e32 v50, v28 +; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v56, v8 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -67126,41 +67193,45 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -67169,7 +67240,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -67178,7 +67249,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -67187,7 +67258,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -67196,7 +67267,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -67205,7 +67276,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -67214,7 +67285,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -67223,7 +67294,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -67232,7 +67303,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -67242,20 +67313,9 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67266,7 +67326,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67277,7 +67337,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67288,7 +67348,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67299,7 +67359,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67310,7 +67370,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67319,9 +67379,9 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67331,8 +67391,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67343,7 +67403,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67354,7 +67414,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67364,30 +67424,37 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -67620,7 +67687,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 @@ -67638,10 +67706,10 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 @@ -67661,7 +67729,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s28, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 @@ -67670,9 +67737,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 @@ -67681,12 +67746,12 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 @@ -67694,11 +67759,12 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 @@ -67713,127 +67779,136 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v45, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e64 v41, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 ; SI-NEXT: v_add_f32_e64 v10, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v14, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_add_f32_e64 v53, s7, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_add_f32_e64 v12, s24, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 ; SI-NEXT: v_add_f32_e64 v27, s46, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s43, 1.0 ; SI-NEXT: v_add_f32_e64 v26, s42, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_add_f32_e64 v53, s7, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 -; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s43, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 -; SI-NEXT: v_add_f32_e64 v45, s9, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 ; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 ; SI-NEXT: v_add_f32_e64 v34, s11, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v56 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e64 v30, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v37 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -67841,13 +67916,11 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 @@ -67856,43 +67929,38 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -67905,7 +67973,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -67919,7 +67987,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -68016,20 +68084,22 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 @@ -68038,41 +68108,37 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -68092,7 +68158,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68118,23 +68184,21 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr40 @@ -68163,27 +68227,28 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: @@ -72560,179 +72625,162 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v56, v10 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB51_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_or_b32_e32 v10, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v11, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v29, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 @@ -72740,15 +72788,18 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v30, v0, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_or_b32_e32 v8, v1, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v8, v1, v56 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -72756,14 +72807,40 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_cbranch_execnz .LBB51_3 -; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB51_3 +; SI-NEXT: .LBB51_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB51_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_cbranch_vccnz .LBB51_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -72804,143 +72881,143 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -72949,7 +73026,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: .LBB51_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -72968,35 +73045,6 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB51_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v39, v23 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v64i16_to_v32f32_scalar: ; VI: ; %bb.0: @@ -88751,8 +88799,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 @@ -88760,133 +88815,93 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 @@ -88895,31 +88910,31 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 @@ -88931,140 +88946,206 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB59_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -89073,306 +89154,277 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 -; SI-NEXT: v_mov_b32_e32 v43, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v50, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_mov_b32_e32 v55, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_mov_b32_e32 v44, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_mov_b32_e32 v54, v23 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 -; SI-NEXT: v_mov_b32_e32 v59, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v45, v24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_mov_b32_e32 v36, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -89399,108 +89451,112 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v35, v57 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB59_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s4, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89509,17 +89565,17 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89529,15 +89585,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89547,15 +89603,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89565,15 +89621,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89583,15 +89639,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89601,15 +89657,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89619,15 +89675,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89636,16 +89692,17 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89655,15 +89712,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89673,84 +89730,79 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -89759,15 +89811,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -89776,15 +89828,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89794,9 +89846,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -89812,106 +89864,110 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -89919,14 +89975,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -89934,14 +89990,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89971,7 +90027,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v16i64_scalar: @@ -89993,113 +90049,115 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -90108,29 +90166,28 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -90140,130 +90197,141 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB59_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -90272,208 +90340,197 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v37, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v62, v0 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v60, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v40, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -90504,85 +90561,95 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v49, v51 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB59_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v51, v41 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v54, v60 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB59_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, s4, v0 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -90591,26 +90658,25 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -90618,8 +90684,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -90631,9 +90697,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -90645,14 +90711,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -90660,280 +90726,280 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -90978,504 +91044,524 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(42) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v47, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v60 -; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v34, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v51, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v53, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v45, v62 +; GFX9-NEXT: v_mov_b32_e32 v46, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -91506,32 +91592,48 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_mov_b32_e32 v63, v57 -; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v57, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB59_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB59_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -91578,348 +91680,352 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -98575,24 +98681,23 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -98602,165 +98707,177 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB63_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 @@ -98768,16 +98885,6 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 @@ -98785,212 +98892,238 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 ; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 +; SI-NEXT: v_mov_b32_e32 v41, v61 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 +; SI-NEXT: v_mov_b32_e32 v55, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: s_branch .LBB63_3 +; SI-NEXT: .LBB63_2: +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v43, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 -; SI-NEXT: s_cbranch_execnz .LBB63_3 -; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB63_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB63_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -99004,105 +99137,107 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: .LBB63_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -99121,41 +99256,6 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB63_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB63_2 ; ; VI-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; VI: ; %bb.0: @@ -101344,13 +101444,12 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -101360,15 +101459,16 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -101451,108 +101551,89 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 @@ -101582,6 +101663,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 @@ -101590,17 +101672,19 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -101617,7 +101701,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_4 @@ -101626,8 +101725,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v2, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc @@ -101646,108 +101745,95 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 +; SI-NEXT: v_addc_u32_e32 v46, vcc, 0, v62, vcc ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v46 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 +; SI-NEXT: v_mov_b32_e32 v58, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_addc_u32_e32 v44, vcc, 0, v62, vcc -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -101765,12 +101851,17 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 @@ -101778,39 +101869,36 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 ; SI-NEXT: v_mov_b32_e32 v50, v29 ; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v46, v28 -; SI-NEXT: v_mov_b32_e32 v34, v8 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_mov_b32_e32 v56, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -101821,32 +101909,34 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 @@ -101855,7 +101945,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -101864,7 +101954,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -101873,7 +101963,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -101882,7 +101972,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -101891,7 +101981,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -101900,7 +101990,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -101909,7 +101999,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -101918,16 +102008,16 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -101937,8 +102027,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101948,8 +102038,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101959,8 +102049,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101970,8 +102060,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101981,8 +102071,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101992,8 +102082,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102003,8 +102093,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102014,8 +102104,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102025,8 +102115,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102036,8 +102126,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102046,18 +102136,16 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 @@ -102068,20 +102156,20 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -107332,179 +107420,162 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v56, v10 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB71_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_or_b32_e32 v10, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v11, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v29, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 @@ -107512,15 +107583,18 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_or_b32_e32 v8, v1, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v8, v1, v56 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -107528,14 +107602,40 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_cbranch_execnz .LBB71_3 -; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB71_3 +; SI-NEXT: .LBB71_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB71_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_cbranch_vccnz .LBB71_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -107576,143 +107676,143 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -107721,7 +107821,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: .LBB71_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -107740,35 +107840,6 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB71_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v39, v23 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v64i16_to_v16i64_scalar: ; VI: ; %bb.0: @@ -123434,8 +123505,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 @@ -123443,133 +123521,93 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 @@ -123578,31 +123616,31 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 @@ -123614,140 +123652,206 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB75_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -123756,306 +123860,277 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 -; SI-NEXT: v_mov_b32_e32 v43, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v50, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_mov_b32_e32 v55, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_mov_b32_e32 v44, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_mov_b32_e32 v54, v23 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 -; SI-NEXT: v_mov_b32_e32 v59, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v45, v24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_mov_b32_e32 v36, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -124082,108 +124157,112 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v35, v57 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB75_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s4, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124192,17 +124271,17 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124212,15 +124291,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124230,15 +124309,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124248,15 +124327,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124266,15 +124345,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124284,15 +124363,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124302,15 +124381,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124319,16 +124398,17 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124338,15 +124418,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124356,84 +124436,79 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -124442,15 +124517,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -124459,15 +124534,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124477,9 +124552,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -124495,106 +124570,110 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -124602,14 +124681,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -124617,14 +124696,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124654,7 +124733,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v16f64_scalar: @@ -124676,113 +124755,115 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -124791,29 +124872,28 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -124823,130 +124903,141 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB75_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -124955,208 +125046,197 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v37, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v62, v0 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v60, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v40, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -125187,85 +125267,95 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v49, v51 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB75_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v51, v41 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v54, v60 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB75_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, s4, v0 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -125274,26 +125364,25 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -125301,8 +125390,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -125314,9 +125403,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -125328,14 +125417,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -125343,280 +125432,280 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -125661,504 +125750,524 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(42) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB75_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v47, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v60 -; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v34, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v51, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v53, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v45, v62 +; GFX9-NEXT: v_mov_b32_e32 v46, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -126189,32 +126298,48 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_mov_b32_e32 v63, v57 -; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v57, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB75_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB75_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -126261,348 +126386,352 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -129496,92 +129625,92 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 +; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v27 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 ; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v32 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 -; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 ; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 ; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 ; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 ; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 @@ -129658,17 +129787,17 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s71 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s69 +; SI-NEXT: v_mov_b32_e32 v1, s36 ; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s68 -; SI-NEXT: v_mov_b32_e32 v61, s4 +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 2 @@ -129690,45 +129819,45 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_mov_b32_e32 v26, s82 ; SI-NEXT: v_mov_b32_e32 v33, s81 ; SI-NEXT: v_mov_b32_e32 v30, s80 +; SI-NEXT: v_mov_b32_e32 v37, s71 ; SI-NEXT: v_mov_b32_e32 v34, s70 -; SI-NEXT: v_mov_b32_e32 v8, s67 -; SI-NEXT: v_mov_b32_e32 v7, s66 -; SI-NEXT: v_mov_b32_e32 v24, s65 -; SI-NEXT: v_mov_b32_e32 v23, s64 -; SI-NEXT: v_mov_b32_e32 v16, s55 -; SI-NEXT: v_mov_b32_e32 v15, s54 -; SI-NEXT: v_mov_b32_e32 v28, s53 -; SI-NEXT: v_mov_b32_e32 v27, s52 -; SI-NEXT: v_mov_b32_e32 v12, s51 -; SI-NEXT: v_mov_b32_e32 v11, s50 -; SI-NEXT: v_mov_b32_e32 v32, s49 -; SI-NEXT: v_mov_b32_e32 v31, s48 -; SI-NEXT: v_mov_b32_e32 v20, s39 -; SI-NEXT: v_mov_b32_e32 v19, s38 -; SI-NEXT: v_mov_b32_e32 v36, s37 -; SI-NEXT: v_mov_b32_e32 v35, s36 -; SI-NEXT: v_mov_b32_e32 v38, s35 -; SI-NEXT: v_mov_b32_e32 v37, s34 -; SI-NEXT: v_mov_b32_e32 v48, s31 -; SI-NEXT: v_mov_b32_e32 v39, s30 +; SI-NEXT: v_mov_b32_e32 v39, s69 +; SI-NEXT: v_mov_b32_e32 v38, s68 +; SI-NEXT: v_mov_b32_e32 v53, s67 +; SI-NEXT: v_mov_b32_e32 v48, s66 +; SI-NEXT: v_mov_b32_e32 v55, s65 +; SI-NEXT: v_mov_b32_e32 v54, s64 +; SI-NEXT: v_mov_b32_e32 v43, s55 +; SI-NEXT: v_mov_b32_e32 v40, s54 +; SI-NEXT: v_mov_b32_e32 v45, s53 +; SI-NEXT: v_mov_b32_e32 v44, s52 +; SI-NEXT: v_mov_b32_e32 v47, s51 +; SI-NEXT: v_mov_b32_e32 v46, s50 +; SI-NEXT: v_mov_b32_e32 v57, s49 +; SI-NEXT: v_mov_b32_e32 v56, s48 +; SI-NEXT: v_mov_b32_e32 v61, s39 +; SI-NEXT: v_mov_b32_e32 v58, s38 +; SI-NEXT: v_mov_b32_e32 v8, s35 +; SI-NEXT: v_mov_b32_e32 v24, s31 +; SI-NEXT: v_mov_b32_e32 v23, s30 ; SI-NEXT: v_mov_b32_e32 v50, s95 ; SI-NEXT: v_mov_b32_e32 v49, s94 ; SI-NEXT: v_mov_b32_e32 v52, s93 ; SI-NEXT: v_mov_b32_e32 v51, s92 -; SI-NEXT: v_mov_b32_e32 v54, s91 -; SI-NEXT: v_mov_b32_e32 v53, s90 -; SI-NEXT: v_mov_b32_e32 v40, s89 -; SI-NEXT: v_mov_b32_e32 v55, s88 +; SI-NEXT: v_mov_b32_e32 v16, s91 +; SI-NEXT: v_mov_b32_e32 v15, s90 +; SI-NEXT: v_mov_b32_e32 v28, s89 +; SI-NEXT: v_mov_b32_e32 v27, s88 ; SI-NEXT: v_mov_b32_e32 v42, s79 ; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v43, s77 -; SI-NEXT: v_mov_b32_e32 v44, s76 -; SI-NEXT: v_mov_b32_e32 v46, s75 -; SI-NEXT: v_mov_b32_e32 v45, s74 -; SI-NEXT: v_mov_b32_e32 v47, s73 -; SI-NEXT: v_mov_b32_e32 v56, s72 -; SI-NEXT: v_mov_b32_e32 v58, s63 -; SI-NEXT: v_mov_b32_e32 v57, s62 +; SI-NEXT: v_mov_b32_e32 v11, s77 +; SI-NEXT: v_mov_b32_e32 v12, s76 +; SI-NEXT: v_mov_b32_e32 v32, s75 +; SI-NEXT: v_mov_b32_e32 v31, s74 +; SI-NEXT: v_mov_b32_e32 v19, s73 +; SI-NEXT: v_mov_b32_e32 v20, s72 +; SI-NEXT: v_mov_b32_e32 v36, s63 +; SI-NEXT: v_mov_b32_e32 v35, s62 ; SI-NEXT: v_mov_b32_e32 v60, s61 ; SI-NEXT: v_mov_b32_e32 v59, s60 ; SI-NEXT: v_mov_b32_e32 v3, s4 @@ -129739,7 +129868,7 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 @@ -129753,30 +129882,30 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -129788,16 +129917,16 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -129816,78 +129945,16 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -129924,20 +129991,82 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -133206,24 +133335,23 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -133233,165 +133361,177 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB79_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 @@ -133399,16 +133539,6 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 @@ -133416,212 +133546,238 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 ; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 +; SI-NEXT: v_mov_b32_e32 v41, v61 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 +; SI-NEXT: v_mov_b32_e32 v55, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 +; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: s_branch .LBB79_3 +; SI-NEXT: .LBB79_2: +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v43, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 -; SI-NEXT: s_cbranch_execnz .LBB79_3 -; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB79_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB79_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -133635,105 +133791,107 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: .LBB79_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -133752,41 +133910,6 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB79_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB79_2 ; ; VI-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; VI: ; %bb.0: @@ -136060,12 +136183,12 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 @@ -136074,38 +136197,41 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_mov_b32_e32 v50, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 @@ -136117,19 +136243,18 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 @@ -136152,7 +136277,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -136166,54 +136290,53 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v53, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 -; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 ; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v40 -; SI-NEXT: v_mov_b32_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v35, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v6 @@ -136240,71 +136363,59 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB80_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f64 v[35:36], v[11:12], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 @@ -136315,17 +136426,19 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 @@ -136351,11 +136464,15 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 @@ -136367,41 +136484,43 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 @@ -136518,7 +136637,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -136527,7 +136646,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -136536,7 +136655,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -136545,7 +136664,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 @@ -136554,7 +136673,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -136563,7 +136682,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 @@ -136572,7 +136691,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -136583,7 +136702,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136594,7 +136713,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136605,7 +136724,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136616,7 +136735,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136627,7 +136746,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136636,18 +136755,16 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 @@ -136656,25 +136773,25 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 @@ -136684,10 +136801,12 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -136874,69 +136993,67 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: s_lshr_b32 s46, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s46 ; SI-NEXT: s_lshr_b32 s46, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s46 ; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s46 ; SI-NEXT: s_lshr_b32 s46, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s46 ; SI-NEXT: s_lshr_b32 s46, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 ; SI-NEXT: s_lshr_b32 s46, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 ; SI-NEXT: s_lshr_b32 s46, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s46 ; SI-NEXT: s_lshr_b32 s46, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 ; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 ; SI-NEXT: s_lshr_b32 s46, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 ; SI-NEXT: s_lshr_b32 s46, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s46 ; SI-NEXT: s_lshr_b32 s46, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 ; SI-NEXT: s_lshr_b32 s46, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 ; SI-NEXT: s_lshr_b32 s46, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s46 ; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s46 ; SI-NEXT: s_lshr_b32 s46, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s46 ; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s46 ; SI-NEXT: s_lshr_b32 s46, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s46 ; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s46 ; SI-NEXT: s_lshr_b32 s46, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 ; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s46 ; SI-NEXT: s_lshr_b32 s46, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 -; SI-NEXT: s_lshr_b32 s46, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 ; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s46 ; SI-NEXT: s_lshr_b32 s46, s23, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 ; SI-NEXT: s_lshr_b32 s46, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 ; SI-NEXT: s_lshr_b32 s46, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 ; SI-NEXT: s_lshr_b32 s46, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 ; SI-NEXT: s_lshr_b32 s46, s19, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 ; SI-NEXT: s_lshr_b32 s46, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s46 @@ -136945,7 +137062,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: s_lshr_b32 s46, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v62, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 @@ -136953,25 +137070,25 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 @@ -136990,151 +137107,158 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 ; SI-NEXT: v_add_f64 v[36:37], s[28:29], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 ; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 ; SI-NEXT: v_add_f64 v[29:30], s[42:43], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v57 +; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v63, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_mov_b32_e32 v15, v12 +; SI-NEXT: v_mov_b32_e32 v12, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v5, v40 +; SI-NEXT: v_mov_b32_e32 v40, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[1:2], s[18:19], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 +; SI-NEXT: v_mov_b32_e32 v7, v42 +; SI-NEXT: v_mov_b32_e32 v42, v20 +; SI-NEXT: v_mov_b32_e32 v20, v21 +; SI-NEXT: v_mov_b32_e32 v21, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v48 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 -; SI-NEXT: v_mov_b32_e32 v7, v61 -; SI-NEXT: v_mov_b32_e32 v61, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v47 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v47, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 -; SI-NEXT: v_mov_b32_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 -; SI-NEXT: v_mov_b32_e32 v14, v12 -; SI-NEXT: v_mov_b32_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 ; SI-NEXT: v_mov_b32_e32 v18, v3 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: .LBB81_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -137151,19 +137275,19 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 ; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -137171,188 +137295,188 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 ; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 ; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 ; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 ; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 ; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 ; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 ; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 ; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 ; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 ; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -137390,62 +137514,62 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -141703,179 +141827,162 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v56, v10 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB87_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_or_b32_e32 v10, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v11, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v29, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 @@ -141883,15 +141990,18 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v30, v0, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_or_b32_e32 v8, v1, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v8, v1, v56 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -141899,14 +142009,40 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_cbranch_execnz .LBB87_3 -; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB87_3 +; SI-NEXT: .LBB87_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB87_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_cbranch_vccnz .LBB87_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -141947,143 +142083,143 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -142092,7 +142228,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: .LBB87_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -142111,35 +142247,6 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB87_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v39, v23 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB87_2 ; ; VI-LABEL: bitcast_v64i16_to_v16f64_scalar: ; VI: ; %bb.0: @@ -150763,22 +150870,22 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -150804,13 +150911,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -150819,49 +150924,46 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -150870,34 +150972,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -150907,131 +151010,155 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -151040,225 +151167,205 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, v8 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v10 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v32, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v42, v43 +; VI-NEXT: v_mov_b32_e32 v43, v37 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v54 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v38, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v39 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v52, v60 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v35 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -151287,14 +151394,54 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB89_3 -; VI-NEXT: .LBB89_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB89_3 +; VI-NEXT: .LBB89_2: +; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v7 +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB89_3: ; %Flow +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB89_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -151313,351 +151460,356 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 -; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 +; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v13, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v50, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 +; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v56, s4, v56 ; VI-NEXT: s_and_b32 s4, s26, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s24, 0xff @@ -151670,35 +151822,26 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_or_b32 s8, s9, s8 ; VI-NEXT: s_and_b32 s9, s16, 0xff ; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_addk_i32 s7, 0x300 ; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s8, s8, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: .LBB89_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -151717,39 +151860,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB89_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB89_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX9: ; %bb.0: @@ -151770,31 +151880,36 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 @@ -151804,133 +151919,129 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 @@ -151938,148 +152049,149 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -152087,19 +152199,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -152121,272 +152226,291 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v57, v5 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v39, v16 -; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v55, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v53, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v52, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v50, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v49, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v51, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v59, v44 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v57, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB89_3 ; GFX9-NEXT: .LBB89_2: -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v51, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v46, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB89_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB89_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 @@ -152399,61 +152523,55 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_lshl_b32 s9, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_and_b32 s4, s24, 0xff ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_and_b32 s5, s26, 0xff @@ -152465,8 +152583,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 ; GFX9-NEXT: s_and_b32 s9, s18, 0xff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -152483,14 +152599,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -152498,9 +152614,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -152510,254 +152626,277 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 -; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 ; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 ; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 ; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 ; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 @@ -152769,33 +152908,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: .LBB89_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -158082,16 +158202,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v46, v15 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -158124,50 +158243,69 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: v_mov_b32_e32 v47, v16 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr36 @@ -158180,21 +158318,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr55 @@ -158204,11 +158342,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -158218,6 +158360,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill @@ -158230,21 +158376,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -158261,555 +158407,551 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v47 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v47 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v63 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v63 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v46 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[46:47] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v17 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: .LBB90_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 ; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v16, v33, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_bfe_u32 v31, v18, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v47, v32, v33, vcc +; GFX9-NEXT: v_add3_u32 v31, v31, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v56, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v18, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_add3_u32 v31, v31, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v31, v32, vcc +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v34, v15, v33, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v33, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v20 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v31, vcc +; GFX9-NEXT: v_perm_b32 v13, v17, v57, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v31, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_perm_b32 v14, v56, v47, s7 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v17, v15, v20, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v20, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v32, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v20, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v31, v17, v0, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_mov_b32_e32 v58, v31 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v21 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v61, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v17, v15, v61, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v24 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v59, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v58, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v26 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v63, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v25 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v62, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v28 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v60, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v27 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v56, v15, v26, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v33, v15, v25, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: s_waitcnt vmcnt(51) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v35, v15, v24, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v60, v17, v25, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v33, v17, v24, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v63 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v63 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v62 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v35, v17, v23, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v16, v17, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v18, v19, vcc +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v15, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v15, vcc -; GFX9-NEXT: v_perm_b32 v37, v1, v23, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_perm_b32 v37, v1, v22, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v2, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v17, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v19, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v48, v1, v20, s7 +; GFX9-NEXT: v_perm_b32 v48, v1, v19, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -158818,8 +158960,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 @@ -158833,13 +158974,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v50, v1, v17, s7 +; GFX9-NEXT: v_perm_b32 v50, v1, v4, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -158862,13 +159003,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc -; GFX9-NEXT: v_perm_b32 v52, v1, v4, s7 +; GFX9-NEXT: v_perm_b32 v52, v1, v3, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v10 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -158887,312 +159028,319 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v31, vcc ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc -; GFX9-NEXT: v_perm_b32 v39, v1, v3, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v31, vcc +; GFX9-NEXT: v_perm_b32 v39, v1, v2, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v12 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v12, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v31, vcc ; GFX9-NEXT: v_add3_u32 v12, v12, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; GFX9-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v31, vcc +; GFX9-NEXT: v_bfe_u32 v31, v1, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v15, v15, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; GFX9-NEXT: v_add3_u32 v31, v31, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v31, v31, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc -; GFX9-NEXT: v_perm_b32 v54, v11, v2, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc +; GFX9-NEXT: v_perm_b32 v54, v11, v1, s7 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX9-NEXT: v_bfe_u32 v31, v11, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11 +; GFX9-NEXT: v_add3_u32 v31, v31, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v31, v31, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v41, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc +; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; GFX9-NEXT: v_perm_b32 v41, v13, v1, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc +; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v31, v31, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_bfe_u32 v26, v31, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v15 +; GFX9-NEXT: v_add3_u32 v26, v26, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_bfe_u32 v31, v15, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v26, v45, vcc +; GFX9-NEXT: v_add3_u32 v31, v31, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v44, v16, v44, vcc -; GFX9-NEXT: v_perm_b32 v16, v44, v13, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc +; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GFX9-NEXT: v_perm_b32 v53, v8, v5, s7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v32 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v30 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_perm_b32 v36, v32, v29, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; GFX9-NEXT: v_perm_b32 v38, v22, v31, s7 ; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GFX9-NEXT: v_perm_b32 v49, v19, v21, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v43, v45, v43, vcc -; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v45, v46, vcc -; GFX9-NEXT: v_perm_b32 v15, v15, v43, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v43 -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[15:16] -; GFX9-NEXT: v_perm_b32 v51, v6, v18, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 +; GFX9-NEXT: v_perm_b32 v49, v18, v20, s7 +; GFX9-NEXT: v_perm_b32 v53, v8, v5, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 ; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 -; GFX9-NEXT: v_perm_b32 v57, v28, v60, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57 +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[41:42] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[41:42] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[54:55] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[54:55] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[39:40] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[39:40] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[52:53] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[52:53] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[50:51] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[50:51] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[48:49] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[48:49] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[37:38] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[37:38] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[35:36] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[56:57] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[62:63] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[60:61] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v42 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v42 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v41 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v55 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v55 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v40 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v53 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v51 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v51 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v38 @@ -159200,439 +159348,449 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v57 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v57 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v56 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v63 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v61 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v50 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v59 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_mov_b32_e32 v34, v62 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v52 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v50 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v60 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v63 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v52 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v61 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v62 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v62 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v61 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v59 +; GFX9-NEXT: v_mov_b32_e32 v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-NEXT: .LBB90_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v48 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v42 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v48 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v60 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v47, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v51 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v35 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -162096,484 +162254,485 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v59, v26 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v8 ; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v59 ; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 ; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v32 ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v49 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v50 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v51 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v52 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v54 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53 +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v45 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v47 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29 -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v49, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s28 +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB91_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v43, v36 -; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v1, v6, 16 +; SI-NEXT: v_alignbit_b32 v5, v1, v6, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_alignbit_b32 v2, v1, v13, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_alignbit_b32 v5, v1, v17, 16 +; SI-NEXT: v_alignbit_b32 v6, v1, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; SI-NEXT: v_alignbit_b32 v10, v1, v16, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_alignbit_b32 v4, v1, v3, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 ; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_alignbit_b32 v16, v1, v57, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_alignbit_b32 v16, v1, v63, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v1, v58, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_alignbit_b32 v10, v1, v60, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_alignbit_b32 v44, v19, v8, 16 -; SI-NEXT: v_alignbit_b32 v7, v1, v22, 16 +; SI-NEXT: v_alignbit_b32 v13, v1, v40, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_alignbit_b32 v61, v1, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 +; SI-NEXT: v_alignbit_b32 v7, v1, v42, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 -; SI-NEXT: v_alignbit_b32 v60, v1, v27, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: v_alignbit_b32 v57, v1, v30, 16 +; SI-NEXT: v_alignbit_b32 v63, v1, v27, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_alignbit_b32 v45, v1, v28, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v1, v38, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; SI-NEXT: v_alignbit_b32 v33, v1, v50, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: v_alignbit_b32 v28, v1, v55, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_alignbit_b32 v44, v62, v9, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v56 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v55, v9, v26, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 -; SI-NEXT: v_alignbit_b32 v58, v22, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v1, v37, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: v_alignbit_b32 v38, v9, v52, 16 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v49 +; SI-NEXT: v_alignbit_b32 v1, v1, v46, 16 +; SI-NEXT: v_alignbit_b32 v46, v47, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v36, v8 +; SI-NEXT: v_alignbit_b32 v8, v44, v2, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v8, v44, v2, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 -; SI-NEXT: v_alignbit_b32 v47, v25, v12, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v44, v2, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 +; SI-NEXT: v_alignbit_b32 v8, v46, v5, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v46, v5, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_alignbit_b32 v8, v46, v5, 8 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_alignbit_b32 v53, v1, v48, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_alignbit_b32 v50, v8, v59, 16 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v54 +; SI-NEXT: v_alignbit_b32 v19, v8, v15, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v19, v6, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 +; SI-NEXT: v_alignbit_b32 v8, v19, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 +; SI-NEXT: v_alignbit_b32 v8, v19, v6, 8 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v38 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v52, v1, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; SI-NEXT: v_mov_b32_e32 v17, v63 -; SI-NEXT: v_alignbit_b32 v1, v1, v41, 16 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 -; SI-NEXT: v_alignbit_b32 v62, v8, v61, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v50, v8, v60, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 +; SI-NEXT: v_alignbit_b32 v8, v50, v10, 24 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_alignbit_b32 v55, v8, v63, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v50, v10, 8 +; SI-NEXT: v_alignbit_b32 v25, v60, v41, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v25, v4, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v25, v4, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v25, v4, 8 +; SI-NEXT: v_mov_b32_e32 v17, v32 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v48, v62, v4, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_alignbit_b32 v38, v8, v45, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_alignbit_b32 v35, v8, v18, 16 -; SI-NEXT: v_mov_b32_e32 v45, v8 -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v38, v16, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 +; SI-NEXT: v_alignbit_b32 v8, v38, v16, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v29, v35, v13, 8 -; SI-NEXT: v_alignbit_b32 v61, v38, v16, 24 -; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_alignbit_b32 v30, v8, v21, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8 +; SI-NEXT: v_alignbit_b32 v20, v50, v10, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v58, v41 +; SI-NEXT: v_mov_b32_e32 v40, v26 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v44 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v59 +; SI-NEXT: v_alignbit_b32 v35, v9, v18, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v35, v13, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_alignbit_b32 v27, v8, v24, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v35, v13, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 +; SI-NEXT: v_alignbit_b32 v30, v9, v21, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v30, v61, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v30, v61, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_alignbit_b32 v24, v8, v26, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v30, v61, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 +; SI-NEXT: v_alignbit_b32 v27, v9, v24, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v27, v7, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_alignbit_b32 v21, v8, v14, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v27, v7, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v27, v7, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v52 +; SI-NEXT: v_alignbit_b32 v24, v9, v29, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v24, v63, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_alignbit_b32 v18, v8, v15, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v24, v63, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v34 +; SI-NEXT: v_alignbit_b32 v21, v9, v12, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v21, v45, 24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_alignbit_b32 v63, v8, v51, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v21, v45, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v39 +; SI-NEXT: v_alignbit_b32 v18, v9, v37, 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v33 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v18, v36, 24 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v59 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 8 -; SI-NEXT: v_alignbit_b32 v12, v40, v43, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v18, v36, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v35 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v18, v36, 8 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v42 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 -; SI-NEXT: v_mov_b32_e32 v15, v9 -; SI-NEXT: v_alignbit_b32 v9, v8, v54, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v30 +; SI-NEXT: v_alignbit_b32 v15, v9, v51, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v23 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v15, v33, 24 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v49 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v27 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v15, v33, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v34 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v52 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v9, v15, v33, 8 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v15 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v33 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v57 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v50 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v20 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v34 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v22 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v21 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v24 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v25 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v39 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v56 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v49 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v9 -; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v55 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v15 +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 8 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v54 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 -; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v24, v63, 24 +; SI-NEXT: v_alignbit_b32 v31, v21, v45, 16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v46 +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v19 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v48 +; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v57 +; SI-NEXT: v_mov_b32_e32 v52, v42 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_alignbit_b32 v12, v9, v12, 16 +; SI-NEXT: v_alignbit_b32 v9, v12, v28, 8 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v23, v48 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v53 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v9, v9, v43, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; SI-NEXT: v_alignbit_b32 v41, v12, v28, 24 +; SI-NEXT: v_alignbit_b32 v32, v12, v28, 16 +; SI-NEXT: v_alignbit_b32 v29, v9, v1, 24 +; SI-NEXT: v_alignbit_b32 v43, v9, v1, 16 +; SI-NEXT: v_alignbit_b32 v26, v9, v1, 8 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v63 -; SI-NEXT: v_mov_b32_e32 v48, v33 -; SI-NEXT: v_mov_b32_e32 v34, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v53 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -162711,968 +162870,987 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v48, v33 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v17, v63 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v40, v26 +; SI-NEXT: v_mov_b32_e32 v58, v41 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; kill: killed $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; kill: killed $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: .LBB91_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v20, v11 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v40, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v17 -; SI-NEXT: v_mov_b32_e32 v54, v61 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v42, v32 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v43 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v52, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v29, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33 +; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 +; SI-NEXT: v_alignbit_b32 v54, v5, v4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v57, v7, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_alignbit_b32 v45, v7, v5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 ; SI-NEXT: v_alignbit_b32 v9, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_alignbit_b32 v12, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v60, v10, v6, 16 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_alignbit_b32 v63, v10, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v14 +; SI-NEXT: v_alignbit_b32 v12, v10, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v48 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31 -; SI-NEXT: v_alignbit_b32 v63, v13, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 +; SI-NEXT: v_alignbit_b32 v15, v13, v10, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v14, v10, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v32 -; SI-NEXT: v_alignbit_b32 v18, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_alignbit_b32 v61, v16, v10, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_alignbit_b32 v18, v16, v13, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v21, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v13, v19, v13, 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 +; SI-NEXT: v_alignbit_b32 v21, v19, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v11 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v16, v13, 16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v19, v16, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v16, v22, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 +; SI-NEXT: v_alignbit_b32 v24, v22, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v27, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v3, v22, v19, 16 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v44 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_alignbit_b32 v3, v25, v19, 16 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v31 +; SI-NEXT: v_alignbit_b32 v27, v25, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; SI-NEXT: v_alignbit_b32 v30, v26, v25, 16 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_alignbit_b32 v4, v28, v22, 16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v59 +; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v25 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v39 +; SI-NEXT: v_alignbit_b32 v35, v34, v28, 16 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_alignbit_b32 v4, v25, v22, 16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v43 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v43, v9, v1, 16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v40 -; SI-NEXT: v_alignbit_b32 v35, v45, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_alignbit_b32 v5, v28, v25, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; SI-NEXT: v_alignbit_b32 v38, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v10, v33, v25, 16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_alignbit_b32 v2, v33, v28, 16 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v49 +; SI-NEXT: v_alignbit_b32 v38, v37, v33, 16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_alignbit_b32 v6, v36, v28, 16 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v28, v29 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v33 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v51 +; SI-NEXT: v_alignbit_b32 v55, v53, v36, 16 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v43 -; SI-NEXT: v_alignbit_b32 v55, v61, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v5, v48, v33, 16 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v33, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_alignbit_b32 v6, v36, v33, 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 -; SI-NEXT: v_add_f32_e32 v46, 0x40c00000, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; SI-NEXT: v_alignbit_b32 v62, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v52, 0x40c00000, v36 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v52 +; SI-NEXT: v_alignbit_b32 v25, v60, v48, 16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v39 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_alignbit_b32 v50, v17, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v56 -; SI-NEXT: v_alignbit_b32 v47, v25, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; SI-NEXT: v_alignbit_b32 v2, v50, v36, 16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v36, v54 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 -; SI-NEXT: v_alignbit_b32 v58, v22, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v56 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v63 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43 -; SI-NEXT: v_alignbit_b32 v43, v38, v16, 8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v41 -; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40 -; SI-NEXT: v_mov_b32_e32 v40, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v54 -; SI-NEXT: v_alignbit_b32 v54, v38, v16, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v20, v35, v13, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 -; SI-NEXT: v_alignbit_b32 v23, v62, v4, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v26 -; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v44, v19, v14, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v50 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v54 +; SI-NEXT: v_alignbit_b32 v50, v19, v48, 16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v40 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v19, v19, v48, 16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v48 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v42 +; SI-NEXT: v_alignbit_b32 v46, v47, v41, 16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v42 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v40 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v52 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v51 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v49 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v39 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v32 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v31 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v39, v50, v10, 16 +; SI-NEXT: v_alignbit_b32 v42, v25, v4, 8 +; SI-NEXT: v_alignbit_b32 v40, v24, v63, 24 +; SI-NEXT: v_alignbit_b32 v31, v21, v45, 16 +; SI-NEXT: v_alignbit_b32 v32, v12, v28, 16 +; SI-NEXT: v_alignbit_b32 v29, v9, v1, 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v26, v9, v1, 8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v46 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v41, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; SI-NEXT: v_alignbit_b32 v44, v62, v41, 16 +; SI-NEXT: v_alignbit_b32 v8, v44, v2, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 +; SI-NEXT: v_alignbit_b32 v8, v46, v5, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v46, v5, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 +; SI-NEXT: v_alignbit_b32 v8, v46, v5, 8 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v19, v6, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 +; SI-NEXT: v_alignbit_b32 v8, v19, v6, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 +; SI-NEXT: v_alignbit_b32 v8, v19, v6, 8 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 +; SI-NEXT: v_alignbit_b32 v8, v50, v10, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v50, v10, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 +; SI-NEXT: v_alignbit_b32 v8, v25, v4, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v25, v4, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 +; SI-NEXT: v_alignbit_b32 v8, v38, v16, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v38, v16, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v35, v13, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 +; SI-NEXT: v_alignbit_b32 v8, v30, v61, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v8, v30, v61, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v30, v61, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 8 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v24, v63, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v24, v63, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v21, v45, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 24 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v21, v45, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v9 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v18, v36, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 16 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v18, v36, 16 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v18, v36, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v15, v33, 24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v15, v33, 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 +; SI-NEXT: v_alignbit_b32 v8, v15, v33, 8 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v8, v12, v28, 8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v50 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v25 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v55 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v24 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v21 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v18 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v15 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v12 +; SI-NEXT: v_alignbit_b32 v17, v44, v2, 24 +; SI-NEXT: v_alignbit_b32 v11, v44, v2, 8 +; SI-NEXT: v_alignbit_b32 v41, v12, v28, 24 +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v44 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v9 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: .LBB91_5: ; %end -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v28 -; SI-NEXT: v_or_b32_e32 v32, v36, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v36, 0xff, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v29 -; SI-NEXT: v_or_b32_e32 v36, v56, v36 -; SI-NEXT: v_or_b32_e32 v32, v32, v36 -; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v51 -; SI-NEXT: v_or_b32_e32 v32, v32, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_or_b32_e32 v14, v14, v36 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v14, v32, v14 -; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; SI-NEXT: v_or_b32_e32 v32, v33, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v6 -; SI-NEXT: v_or_b32_e32 v14, v32, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v14 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v60 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v5 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v28, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v17, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v49 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2 -; SI-NEXT: v_or_b32_e32 v14, v28, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v57 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v63 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v39 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v54 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v2 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v23 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v10 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v22, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v31 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v51 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v22, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v42 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v2 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v2 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v43 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v54 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v2 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v2 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v38 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v16 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v2 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v20 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v38 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v37 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v2 +; SI-NEXT: v_or_b32_e32 v11, v14, v11 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v2 ; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v8, v8, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v61 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v30 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v2 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v27 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v2 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 @@ -163681,33 +163859,31 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 @@ -163715,20 +163891,16 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v2 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -163736,14 +163908,14 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -163755,12 +163927,12 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -163771,83 +163943,72 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v41 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v57 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v20 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -168941,39 +169102,38 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v44, v19 -; SI-NEXT: v_mov_b32_e32 v43, v17 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v14, v12 -; SI-NEXT: v_mov_b32_e32 v12, v10 -; SI-NEXT: v_mov_b32_e32 v41, v7 -; SI-NEXT: v_mov_b32_e32 v55, v5 -; SI-NEXT: v_mov_b32_e32 v54, v3 -; SI-NEXT: v_mov_b32_e32 v51, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; SI-NEXT: v_mov_b32_e32 v42, v29 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v46, v21 +; SI-NEXT: v_mov_b32_e32 v47, v19 +; SI-NEXT: v_mov_b32_e32 v56, v17 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v11 +; SI-NEXT: v_mov_b32_e32 v55, v9 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:392 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 @@ -168982,335 +169142,359 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:136 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:144 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:344 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 @@ -169319,118 +169503,129 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_or_b32_e32 v9, v2, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v11, v2, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v13, v2, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v15, v2, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v17, v2, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v19, v2, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v21, v2, v62 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -169440,1780 +169635,1688 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v39, 0xff, v50 -; SI-NEXT: v_and_b32_e32 v48, 0xff, v40 -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_or_b32_e32 v45, v46, v45 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v56, v56, v61 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v47, v3 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v39, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v49 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v51 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v23, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v25, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 -; SI-NEXT: v_mov_b32_e32 v8, v7 -; SI-NEXT: v_mov_b32_e32 v7, v19 -; SI-NEXT: v_or_b32_e32 v19, v2, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v17, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v23, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v31, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v51, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v4, v4, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v10, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v22, v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v24, v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v26, v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v26, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v28, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v30, v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v30, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v34, v34, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v34, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v20, v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v35, v35, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v35, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v36, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v36, v36, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v37, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v37, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v37, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v38, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v38, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v38, v38, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v39, v39, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v39, v39, v1 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: v_and_b32_e32 v48, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v48, v48, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v51, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v7, v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v48, v48, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v49, v49, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v51, v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v49, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v52, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v50, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v52, v52, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v50, v50, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v9, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v54, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v54, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v52, v52, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v53, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v11, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v53, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v53, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v53, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_and_b32_e32 v54, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v55, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v54, v54, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v55, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v55, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v13, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v41, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v40, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v41, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v41, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v40, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v40 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_and_b32_e32 v42, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v42, v42, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v42 +; SI-NEXT: v_or_b32_e32 v42, v42, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v43, 0xff, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v43, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v43, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_or_b32_e32 v43, v43, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v44, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v44, v44, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v44, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v45, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v46, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v46, v46, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_or_b32_e32 v45, v45, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v58, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v58, v58, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v58 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_and_b32_e32 v46, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v59, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v59, v59, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v59 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: v_or_b32_e32 v46, v46, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v60, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v60, v60, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v61, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v47, v47, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v35, v47 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v61, v61, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v56, v56, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v57, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v57, v57, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v58, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v58, v58, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v59, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v59, v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v60, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v60, v60, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v61, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v61, v61, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v62, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v62, v62, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v31, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v63, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v63, v63, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB92_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v5, v3, v2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v19 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 ; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_mov_b32_e32 v7, v49 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: v_mov_b32_e32 v19, v44 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v56, v62, v56 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 +; SI-NEXT: v_or_b32_e32 v62, v63, v62 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 ; SI-NEXT: s_movk_i32 s6, 0x300 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v57, v61, v57 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v58, v14, v58 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v23, v63, v2 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v11 +; SI-NEXT: v_or_b32_e32 v59, v12, v59 +; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v61, v8, v61 +; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v62 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v61 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v59 +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v58 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v57 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v25, v25, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v31, v62, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v33, v61, v2 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v35, v60, v2 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v12, v12, v60 -; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v37, v59, v2 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v14, v14, v59 -; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v38, v58, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v25, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v39, v45, v2 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v13 -; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v32, v32, v58 -; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v33 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v40 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v60, v10, v60 +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v6, v41 +; SI-NEXT: v_or_b32_e32 v63, v6, v63 +; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v20 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v0, v16 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v36, v7, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v42, v4, v42 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v43, v4, v43 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v43 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v41, v2, v49 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v44, v4, v44 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v9, v50 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v45, v4, v45 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v45 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v52, v2, v52 +; SI-NEXT: v_mov_b32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v7, v7, v53 -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v46, v4, v46 +; SI-NEXT: v_or_b32_e32 v13, v2, v3 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v46 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v52 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v63 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v62 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v61 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v59 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v57 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v53, v9, v53 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v47, v4, v47 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v47 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v8, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v6, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v35 -; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v0, v63 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v40, v9, v40 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_mov_b32_e32 v37, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v38 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v31 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v60 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v58 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 ; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v10 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v10 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -177051,22 +177154,22 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -177092,13 +177195,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -177107,49 +177208,46 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -177158,34 +177256,35 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -177195,131 +177294,155 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -177328,225 +177451,205 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, v8 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v10 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v32, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v42, v43 +; VI-NEXT: v_mov_b32_e32 v43, v37 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v54 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v38, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v39 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v52, v60 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v35 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -177575,14 +177678,54 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB93_3 -; VI-NEXT: .LBB93_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB93_3 +; VI-NEXT: .LBB93_2: +; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v7 +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB93_3: ; %Flow +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB93_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -177601,351 +177744,356 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 -; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 +; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v13, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v50, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 +; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v56, s4, v56 ; VI-NEXT: s_and_b32 s4, s26, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s24, 0xff @@ -177958,35 +178106,26 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_or_b32 s8, s9, s8 ; VI-NEXT: s_and_b32 s9, s16, 0xff ; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_addk_i32 s7, 0x300 ; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s8, s8, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: .LBB93_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -178005,39 +178144,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB93_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB93_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX9: ; %bb.0: @@ -178058,31 +178164,36 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 @@ -178092,133 +178203,129 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 @@ -178226,148 +178333,149 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -178375,19 +178483,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -178409,272 +178510,291 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v57, v5 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v39, v16 -; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v55, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v53, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v52, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v50, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v49, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v51, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v59, v44 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v57, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB93_3 ; GFX9-NEXT: .LBB93_2: -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v51, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v46, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB93_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB93_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 @@ -178687,61 +178807,55 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_lshl_b32 s9, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_and_b32 s4, s24, 0xff ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_and_b32 s5, s26, 0xff @@ -178753,8 +178867,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 ; GFX9-NEXT: s_and_b32 s9, s18, 0xff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -178771,14 +178883,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -178786,9 +178898,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -178798,254 +178910,277 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 -; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 ; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 ; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 ; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 ; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 @@ -179057,33 +179192,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: .LBB93_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -182779,1037 +182895,1093 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: v_mov_b32_e32 v44, v12 +; VI-NEXT: v_mov_b32_e32 v12, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v43, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; VI-NEXT: v_mov_b32_e32 v32, v20 +; VI-NEXT: v_mov_b32_e32 v55, v22 +; VI-NEXT: v_mov_b32_e32 v54, v21 +; VI-NEXT: v_mov_b32_e32 v31, v19 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v61 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v60 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v4 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v3 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v2 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v1 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v61 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v61 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v60 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v30 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v28 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v27 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v26 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v24 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v23 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v22 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v21 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v20 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v19 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v18 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[7:8] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v31, v33 -; VI-NEXT: v_mov_b32_e32 v33, v43 -; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v43, v33 -; VI-NEXT: v_mov_b32_e32 v33, v46 -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[3:4] -; VI-NEXT: v_mov_b32_e32 v46, v33 -; VI-NEXT: v_mov_b32_e32 v33, v53 -; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2] -; VI-NEXT: v_mov_b32_e32 v53, v33 -; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[60:61] -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[29:30] -; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28] -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, v36 -; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] -; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] -; VI-NEXT: v_mov_b32_e32 v36, v33 -; VI-NEXT: v_mov_b32_e32 v33, v41 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; VI-NEXT: v_mov_b32_e32 v34, v51 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v56, v38 +; VI-NEXT: v_mov_b32_e32 v45, v7 +; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: v_mov_b32_e32 v15, v3 +; VI-NEXT: v_mov_b32_e32 v28, v48 +; VI-NEXT: v_mov_b32_e32 v48, v16 +; VI-NEXT: v_mov_b32_e32 v16, v40 +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 +; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v44 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v43 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v62, v36 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[43:44] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[10:11] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v34, v14 +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_mov_b32_e32 v7, v45 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: v_mov_b32_e32 v3, v15 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_mov_b32_e32 v38, v56 +; VI-NEXT: v_mov_b32_e32 v29, v41 +; VI-NEXT: v_mov_b32_e32 v45, v60 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] +; VI-NEXT: v_mov_b32_e32 v58, v51 +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v41, v33 -; VI-NEXT: v_mov_b32_e32 v33, v31 -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[19:20] -; VI-NEXT: v_mov_b32_e32 v51, v34 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 +; VI-NEXT: v_mov_b32_e32 v40, v16 +; VI-NEXT: v_mov_b32_e32 v16, v48 +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 +; VI-NEXT: v_mov_b32_e32 v39, v47 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v63, 0x200 -; VI-NEXT: v_add_f16_sdwa v31, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 ; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 -; VI-NEXT: v_or_b32_e32 v32, v18, v31 -; VI-NEXT: v_add_f16_sdwa v31, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_f16_sdwa v20, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v58, v18, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 ; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 -; VI-NEXT: v_or_b32_e32 v31, v17, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v20, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 -; VI-NEXT: v_or_b32_e32 v32, v20, v31 -; VI-NEXT: v_add_f16_sdwa v31, v19, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 -; VI-NEXT: v_or_b32_e32 v31, v19, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v34, v22, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 -; VI-NEXT: v_or_b32_e32 v32, v22, v31 -; VI-NEXT: v_add_f16_sdwa v31, v21, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 -; VI-NEXT: v_or_b32_e32 v31, v21, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: v_or_b32_e32 v32, v24, v31 -; VI-NEXT: v_add_f16_sdwa v31, v23, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 -; VI-NEXT: v_or_b32_e32 v31, v23, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v26, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 -; VI-NEXT: v_or_b32_e32 v36, v26, v31 -; VI-NEXT: v_add_f16_sdwa v31, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_f16_sdwa v22, v32, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v57, v17, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v15, v32, v0 +; VI-NEXT: v_add_f16_sdwa v0, v31, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_e32 v14, v31, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_e32 v62, v55, v0 +; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v61, v54, v0 +; VI-NEXT: v_mov_b32_e32 v26, v54 +; VI-NEXT: v_mov_b32_e32 v27, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_or_b32_e32 v35, v25, v31 -; VI-NEXT: v_add_f16_sdwa v31, v28, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 -; VI-NEXT: v_or_b32_e32 v38, v28, v31 -; VI-NEXT: v_add_f16_sdwa v31, v27, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 -; VI-NEXT: v_or_b32_e32 v37, v27, v31 -; VI-NEXT: v_add_f16_sdwa v31, v30, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 -; VI-NEXT: v_add_f16_sdwa v32, v29, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 -; VI-NEXT: v_or_b32_e32 v49, v30, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v48, v29, v31 -; VI-NEXT: v_add_f16_sdwa v31, v61, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v61, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v32, v60, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v60, 0x200, v60 -; VI-NEXT: v_or_b32_e32 v51, v61, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v50, v60, v31 -; VI-NEXT: v_add_f16_sdwa v31, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v34, v25, v0 +; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v33, v24, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v32, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_or_b32_e32 v36, v2, v0 +; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_or_b32_e32 v53, v2, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v52, v1, v31 -; VI-NEXT: v_add_f16_sdwa v31, v4, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v32, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v35, v1, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v38, v2, v0 +; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v37, v1, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v49, v9, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v46, v4, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v45, v3, v31 -; VI-NEXT: v_add_f16_sdwa v31, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v48, v8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_mov_b32_e32 v9, v31 +; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v10, v32 +; VI-NEXT: v_add_f16_e32 v43, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v51, v3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_or_b32_e32 v50, v2, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v3, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v53, v2, v0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v52, v1, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v59, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v46, v2, v0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; VI-NEXT: v_or_b32_e32 v45, v1, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_add_f16_sdwa v32, v5, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v43, v6, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_add_f16_sdwa v44, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v42, v5, v31 -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_add_f16_sdwa v32, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, v7, v0 +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_or_b32_e32 v4, v6, v0 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v41, v8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v40, v7, v31 -; VI-NEXT: v_add_f16_sdwa v31, v10, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 -; VI-NEXT: v_add_f16_sdwa v32, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v55, v10, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_add_f16_sdwa v39, v12, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v54, v9, v31 -; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 -; VI-NEXT: v_add_f16_sdwa v33, v11, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v39 -; VI-NEXT: v_add_f16_sdwa v47, v14, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v32, v12, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 -; VI-NEXT: v_add_f16_sdwa v33, v13, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v47 -; VI-NEXT: v_or_b32_e32 v57, v14, v56 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v33 -; VI-NEXT: v_add_f16_sdwa v33, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 -; VI-NEXT: v_add_f16_sdwa v63, v15, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v33 -; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 -; VI-NEXT: v_or_b32_e32 v59, v16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v63 -; VI-NEXT: v_or_b32_e32 v58, v15, v58 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v59 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v58 -; VI-NEXT: v_lshrrev_b64 v[58:59], 24, v[58:59] -; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v56, v13, v56 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v57 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v56 -; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[56:57] -; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v31, v11, v31 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v32 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v31 -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[31:32] -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v55 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v54 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[54:55] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v41 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v40 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[40:41] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v43 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v42 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v46 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v45 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v53 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v52 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51] -; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[42:43] -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[45:46] -; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[48:49] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v38 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v37 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[37:38] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v36 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v35 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_e32 v41, v7, v0 +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v40, v6, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_e32 v7, v25, v0 +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46 +; VI-NEXT: v_or_b32_e32 v6, v24, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_or_b32_e32 v32, v44, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; VI-NEXT: v_or_b32_e32 v31, v43, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_e32 v30, v2, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_e32 v29, v1, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v34 +; VI-NEXT: v_or_b32_e32 v1, v55, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: v_or_b32_e32 v0, v54, v0 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[29:30] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[31:32] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v31, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v7, v11 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] +; VI-NEXT: v_mov_b32_e32 v55, v27 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v54, v26 +; VI-NEXT: v_mov_b32_e32 v26, v20 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[4:5] +; VI-NEXT: v_mov_b32_e32 v5, v22 +; VI-NEXT: v_mov_b32_e32 v13, v21 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v54, v39 -; VI-NEXT: v_mov_b32_e32 v37, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v56, v58 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[48:49] -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[50:51] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51] -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v51 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v50 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v33, 8, 8 -; VI-NEXT: v_mov_b32_e32 v33, v47 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v33, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v39, 8, 8 -; VI-NEXT: v_mov_b32_e32 v39, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51] -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v36, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v56 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v23 +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 +; VI-NEXT: v_mov_b32_e32 v14, v8 +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v45 +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v51 +; VI-NEXT: v_mov_b32_e32 v38, v28 +; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[57:58] +; VI-NEXT: v_bfe_u32 v28, v36, 8, 8 +; VI-NEXT: v_bfe_u32 v29, v38, 8, 8 +; VI-NEXT: v_mov_b32_e32 v53, v3 +; VI-NEXT: v_bfe_u32 v15, v3, 8, 8 +; VI-NEXT: v_mov_b32_e32 v3, v59 +; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 +; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 +; VI-NEXT: v_bfe_u32 v58, v60, 8, 8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_bfe_u32 v32, v63, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v44, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_bfe_u32 v32, v47, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v44, v32 -; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v55, v32 -; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v36, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v58, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v57, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v59, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v34, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v53, 8, 8 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v59 -; VI-NEXT: v_mov_b32_e32 v59, v34 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v41, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v47, v0, 8, 8 ; VI-NEXT: .LBB94_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v52 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52 +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, v12, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v8 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v33 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v63 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -186076,7 +186248,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 ; SI-NEXT: v_mov_b32_e32 v59, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 @@ -186086,7 +186258,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 @@ -186135,7 +186307,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v40, v44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -186168,12 +186340,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -186193,21 +186365,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB95_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v13, v13, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v44 ; SI-NEXT: v_or_b32_e32 v55, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 ; SI-NEXT: v_or_b32_e32 v57, v16, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 @@ -186226,7 +186398,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 ; SI-NEXT: v_or_b32_e32 v14, v63, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_or_b32_e32 v42, v58, v11 +; SI-NEXT: v_or_b32_e32 v58, v58, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -186234,7 +186406,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 ; SI-NEXT: v_or_b32_e32 v22, v2, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 ; SI-NEXT: v_or_b32_e32 v12, v46, v12 @@ -186266,7 +186438,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v36, v2 ; SI-NEXT: v_mov_b32_e32 v35, v1 ; SI-NEXT: v_alignbit_b32 v1, v55, v13, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v55, v13, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill @@ -186308,15 +186480,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v14, v43, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 24 +; SI-NEXT: v_alignbit_b32 v1, v60, v58, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 16 +; SI-NEXT: v_alignbit_b32 v1, v60, v58, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 8 +; SI-NEXT: v_alignbit_b32 v1, v60, v58, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v24, v22, 24 @@ -186345,13 +186517,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v59, v3, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v34 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 ; SI-NEXT: v_or_b32_e32 v61, v50, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 ; SI-NEXT: v_or_b32_e32 v2, v48, v11 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v49, v6 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v1, v62, v49, 24 @@ -186387,7 +186558,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v1, v61, v18, 16 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_or_b32_e32 v58, v54, v11 +; SI-NEXT: v_or_b32_e32 v56, v54, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 ; SI-NEXT: v_or_b32_e32 v6, v53, v11 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -186397,34 +186568,37 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v1, v61, v18, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v58, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v56, v2, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v58, v2, 8 +; SI-NEXT: v_alignbit_b32 v1, v56, v2, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v34 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v59 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -186442,7 +186616,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v54, v40, v11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v56 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 @@ -186452,28 +186626,28 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v44, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v29, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill @@ -186496,14 +186670,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v1, v52, 8, 8 -; SI-NEXT: v_alignbit_b32 v28, v58, v2, 24 +; SI-NEXT: v_alignbit_b32 v28, v56, v2, 24 ; SI-NEXT: v_alignbit_b32 v2, v54, v6, 24 ; SI-NEXT: v_alignbit_b32 v39, v54, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v54, v6, 8 +; SI-NEXT: v_alignbit_b32 v30, v54, v6, 8 ; SI-NEXT: v_alignbit_b32 v27, v12, v11, 24 -; SI-NEXT: v_alignbit_b32 v56, v12, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 8 +; SI-NEXT: v_alignbit_b32 v42, v12, v11, 16 +; SI-NEXT: v_alignbit_b32 v26, v12, v11, 8 ; SI-NEXT: v_mov_b32_e32 v20, v29 ; SI-NEXT: v_mov_b32_e32 v15, v33 ; SI-NEXT: v_mov_b32_e32 v33, v32 @@ -186518,7 +186691,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB95_3 ; SI-NEXT: .LBB95_2: -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -186687,13 +186860,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -186705,13 +186877,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; kill: killed $vgpr1 @@ -186719,14 +186892,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: .LBB95_3: ; %Flow ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, v44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v5, v8 ; SI-NEXT: v_mov_b32_e32 v6, v7 ; SI-NEXT: v_mov_b32_e32 v7, v9 @@ -186735,24 +186908,24 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v31, v33 ; SI-NEXT: v_mov_b32_e32 v44, v15 ; SI-NEXT: v_mov_b32_e32 v33, v20 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v50, v2 -; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v53, v30 ; SI-NEXT: v_mov_b32_e32 v40, v28 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mov_b32_e32 v2, v48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v11, v27 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v30, v29 -; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v42, v26 +; SI-NEXT: v_mov_b32_e32 v26, v52 ; SI-NEXT: s_cbranch_vccnz .LBB95_5 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload @@ -186766,48 +186939,51 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -186817,22 +186993,22 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; SI-NEXT: v_or_b32_e32 v56, v12, v11 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 @@ -186851,61 +187027,60 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v36, v14, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_or_b32_e32 v54, v14, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_or_b32_e32 v52, v17, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v58, v17, v19 -; SI-NEXT: v_alignbit_b32 v40, v58, v52, 24 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v56, v17, v19 +; SI-NEXT: v_alignbit_b32 v40, v56, v52, 24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v11, v21, v19 +; SI-NEXT: v_or_b32_e32 v42, v21, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_mov_b32_e32 v1, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v61, v21, v22 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 @@ -186957,8 +187132,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 +; SI-NEXT: v_alignbit_b32 v27, v12, v11, 24 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 @@ -186977,7 +187151,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_or_b32_e32 v3, v30, v28 @@ -187012,14 +187186,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v35, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_or_b32_e32 v24, v37, v36 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 -; SI-NEXT: v_or_b32_e32 v42, v39, v37 +; SI-NEXT: v_or_b32_e32 v58, v39, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 ; SI-NEXT: v_mov_b32_e32 v36, v48 +; SI-NEXT: v_alignbit_b32 v38, v54, v29, 24 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v31 @@ -187086,7 +187261,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_alignbit_b32 v50, v54, v29, 24 +; SI-NEXT: v_alignbit_b32 v50, v12, v11, 16 ; SI-NEXT: v_or_b32_e32 v57, v48, v37 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v18 @@ -187149,13 +187324,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v10, v14, v43, 16 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 24 +; SI-NEXT: v_alignbit_b32 v10, v60, v58, 24 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 16 +; SI-NEXT: v_alignbit_b32 v10, v60, v58, 16 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 8 +; SI-NEXT: v_alignbit_b32 v10, v60, v58, 8 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v10, v24, v22, 24 @@ -187212,46 +187387,40 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v10, v23, v35, 8 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 24 +; SI-NEXT: v_alignbit_b32 v10, v61, v42, 24 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 16 +; SI-NEXT: v_alignbit_b32 v10, v61, v42, 16 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 8 +; SI-NEXT: v_alignbit_b32 v10, v61, v42, 8 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v58, v52, 16 +; SI-NEXT: v_alignbit_b32 v10, v56, v52, 16 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v10, v58, v52, 8 +; SI-NEXT: v_alignbit_b32 v10, v56, v52, 8 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v56 -; SI-NEXT: v_alignbit_b32 v11, v12, v10, 24 -; SI-NEXT: v_alignbit_b32 v56, v12, v10, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v38, v12, v10, 8 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v55 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v17 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v47 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v14 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v60 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v24 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill @@ -187271,7 +187440,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v58 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v56 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v54 @@ -187281,28 +187450,28 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v20, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v18, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v15, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v33, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v44, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v31, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v30, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v9, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v8, 8, 8 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill @@ -187314,25 +187483,29 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_bfe_u32 v10, v26, 8, 8 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_bfe_u32 v10, v2, 8, 8 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v1, 8, 8 ; SI-NEXT: v_alignbit_b32 v48, v55, v13, 24 +; SI-NEXT: v_alignbit_b32 v42, v12, v11, 8 ; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 ; SI-NEXT: v_bfe_u32 v30, v7, 8, 8 +; SI-NEXT: v_mov_b32_e32 v26, v1 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_bfe_u32 v10, v37, 8, 8 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v27, 8, 8 +; SI-NEXT: v_bfe_u32 v10, v1, 8, 8 +; SI-NEXT: v_mov_b32_e32 v1, v37 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: .LBB95_5: ; %end ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v37, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v48 @@ -187367,12 +187540,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v51, v52, v51 ; SI-NEXT: v_or_b32_e32 v37, v37, v51 ; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v37, 0xff, v55 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v37, v37, v51 ; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -187399,12 +187572,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v20, v20, v37 ; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v20, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v20, v20, v37 ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -187431,12 +187604,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v18, v18, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v18, v18, v20 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -187463,12 +187636,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v47 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -187495,12 +187668,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -187513,7 +187686,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v58 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload @@ -187529,12 +187702,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v15, v15, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -187564,7 +187737,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v18, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v24 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -187597,7 +187770,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v10, v10, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29 @@ -187791,7 +187964,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v58 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v56 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 @@ -187806,7 +187979,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v50 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 @@ -187832,14 +188005,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -187853,7 +188024,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 @@ -188114,113 +188285,105 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshr_b32 s46, s45, 16 ; VI-NEXT: v_mov_b32_e32 v7, 0x200 -; VI-NEXT: v_add_f16_e32 v1, s46, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v11, s46, v7 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; VI-NEXT: v_add_f16_e32 v2, s45, v7 ; VI-NEXT: s_lshr_b32 s45, s44, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v23, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s45, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v40, s45, v7 ; VI-NEXT: v_add_f16_e32 v2, s44, v7 ; VI-NEXT: s_lshr_b32 s44, s43, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; VI-NEXT: v_add_f16_e32 v43, s44, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v22, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s44, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 ; VI-NEXT: v_add_f16_e32 v2, s43, v7 ; VI-NEXT: s_lshr_b32 s43, s42, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v25, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s43, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v54, s43, v7 ; VI-NEXT: v_add_f16_e32 v2, s42, v7 ; VI-NEXT: s_lshr_b32 s42, s41, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; VI-NEXT: v_add_f16_e32 v37, s42, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v24, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s42, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 ; VI-NEXT: v_add_f16_e32 v2, s41, v7 ; VI-NEXT: s_lshr_b32 s41, s40, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v27, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s41, v7 +; VI-NEXT: v_add_f16_e32 v49, s41, v7 ; VI-NEXT: v_add_f16_e32 v2, s40, v7 ; VI-NEXT: s_lshr_b32 s40, s15, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v53, s40, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; VI-NEXT: v_add_f16_e32 v52, s40, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v26, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 ; VI-NEXT: v_add_f16_e32 v2, s15, v7 ; VI-NEXT: s_lshr_b32 s15, s14, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v29, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s15, v7 +; VI-NEXT: v_add_f16_e32 v55, s15, v7 ; VI-NEXT: v_add_f16_e32 v2, s14, v7 ; VI-NEXT: s_lshr_b32 s14, s13, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v43, s14, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; VI-NEXT: v_add_f16_e32 v53, s14, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v28, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 ; VI-NEXT: v_add_f16_e32 v2, s13, v7 ; VI-NEXT: s_lshr_b32 s13, s12, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v6, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s13, v7 ; VI-NEXT: v_add_f16_e32 v2, s12, v7 ; VI-NEXT: s_lshr_b32 s12, s11, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v37, s12, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v39, s12, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 ; VI-NEXT: v_add_f16_e32 v2, s11, v7 ; VI-NEXT: s_lshr_b32 s11, s10, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v31, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s11, v7 +; VI-NEXT: v_add_f16_e32 v60, s11, v7 ; VI-NEXT: v_add_f16_e32 v2, s10, v7 ; VI-NEXT: s_lshr_b32 s10, s9, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v52, s10, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; VI-NEXT: v_add_f16_e32 v48, s10, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v30, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 ; VI-NEXT: v_add_f16_e32 v2, s9, v7 ; VI-NEXT: s_lshr_b32 s9, s8, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v4, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s9, v7 ; VI-NEXT: v_add_f16_e32 v2, s8, v7 ; VI-NEXT: s_lshr_b32 s8, s7, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v50, s8, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v3, v2, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 ; VI-NEXT: v_add_f16_e32 v2, s7, v7 ; VI-NEXT: s_lshr_b32 s7, s6, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v2, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s7, v7 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s17, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v36, s6, v7 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v1, v8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 ; VI-NEXT: v_add_f16_e32 v9, s17, v7 @@ -188228,12 +188391,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v33, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s19, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s16, v7 ; VI-NEXT: v_add_f16_e32 v38, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v32, v9, v8 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 ; VI-NEXT: v_add_f16_e32 v9, s19, v7 @@ -188241,12 +188404,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v21, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s21, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s18, v7 ; VI-NEXT: v_add_f16_e32 v61, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v20, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s20, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v61 @@ -188254,12 +188417,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v35, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s23, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s20, v7 ; VI-NEXT: v_add_f16_e32 v45, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v34, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s22, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 @@ -188267,12 +188430,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v19, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s25, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s22, v7 ; VI-NEXT: v_add_f16_e32 v47, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v18, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s24, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 @@ -188280,12 +188443,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v16, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s27, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s24, v7 ; VI-NEXT: v_add_f16_e32 v57, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v15, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s26, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 @@ -188293,112 +188456,116 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v13, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s29, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s26, v7 ; VI-NEXT: v_add_f16_e32 v59, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v12, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s28, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 ; VI-NEXT: v_add_f16_e32 v9, s29, v7 -; VI-NEXT: s_lshr_b32 s6, s5, 16 ; VI-NEXT: v_or_b32_e32 v10, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 -; VI-NEXT: s_lshr_b32 s7, s4, 16 -; VI-NEXT: v_add_f16_e32 v51, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s28, v7 -; VI-NEXT: v_add_f16_e32 v54, s5, v7 -; VI-NEXT: v_add_f16_e32 v11, s7, v7 -; VI-NEXT: v_add_f16_e32 v55, s4, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: s_lshr_b32 s7, s4, 16 +; VI-NEXT: v_add_f16_e32 v51, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v9, v9, v8 -; VI-NEXT: v_or_b32_e32 v8, v54, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v7, v55, v7 -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v8 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v7 +; VI-NEXT: v_add_f16_e32 v8, s5, v7 +; VI-NEXT: v_add_f16_e32 v14, s7, v7 +; VI-NEXT: v_add_f16_e32 v17, s4, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v8, v8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v7, v17, v7 +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v8 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v13 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v16 -; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v35 +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v23 ; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[22:23] +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v23, v50, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v52, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v37, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v43, 8, 8 -; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v53, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[24:25] -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v11 +; VI-NEXT: v_bfe_u32 v11, v48, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[30:31] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v11, v39, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v20 ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21] -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[30:31], 24, v[26:27] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v11, v53, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34 ; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v21 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33] -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[28:29] -; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[24:25] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v25 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v22 ; VI-NEXT: v_bfe_u32 v25, v51, 8, 8 ; VI-NEXT: v_bfe_u32 v27, v59, 8, 8 @@ -188408,12 +188575,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v1, v61, 8, 8 ; VI-NEXT: v_bfe_u32 v22, v38, 8, 8 ; VI-NEXT: v_bfe_u32 v2, v36, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v44, v37, 8, 8 +; VI-NEXT: v_bfe_u32 v11, v43, 8, 8 ; VI-NEXT: v_bfe_u32 v26, v50, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfe_u32 v24, v24, 8, 8 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: ; implicit-def: $sgpr46 @@ -188573,133 +188739,120 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: s_branch .LBB95_2 ; VI-NEXT: .LBB95_4: ; VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s45 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s42 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s43 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s40 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s41 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s12 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s71 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s69 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s68 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s67 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s66 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s65 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s64 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s87 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s85 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s53 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s52 ; VI-NEXT: v_readlane_b32 s6, v62, 0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 1 ; VI-NEXT: v_mov_b32_e32 v36, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 3 ; VI-NEXT: v_mov_b32_e32 v38, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 4 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 5 ; VI-NEXT: v_mov_b32_e32 v61, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 7 ; VI-NEXT: v_mov_b32_e32 v45, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 9 ; VI-NEXT: v_mov_b32_e32 v47, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 10 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 11 ; VI-NEXT: v_mov_b32_e32 v57, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 12 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 13 -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 16 ; VI-NEXT: v_mov_b32_e32 v59, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 14 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v22, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 19 ; VI-NEXT: v_mov_b32_e32 v15, s4 @@ -188714,45 +188867,48 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s4, v62, 24 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s80 ; VI-NEXT: v_mov_b32_e32 v46, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s81 ; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s80 -; VI-NEXT: v_mov_b32_e32 v60, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s82 +; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s81 -; VI-NEXT: v_mov_b32_e32 v40, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s82 -; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: v_mov_b32_e32 v49, s4 +; VI-NEXT: v_mov_b32_e32 v7, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_mov_b32_e32 v10, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 39 ; VI-NEXT: v_mov_b32_e32 v42, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 40 @@ -188760,31 +188916,36 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s4, v62, 41 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s78 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: v_mov_b32_e32 v30, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s46 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s78 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 50 ; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 51 @@ -188796,22 +188957,29 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v34, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: v_mov_b32_e32 v31, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 ; VI-NEXT: v_mov_b32_e32 v3, s88 ; VI-NEXT: v_readlane_b32 s6, v62, 15 ; VI-NEXT: v_mov_b32_e32 v21, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s71 ; VI-NEXT: v_mov_b32_e32 v50, s70 -; VI-NEXT: v_mov_b32_e32 v43, s54 -; VI-NEXT: v_mov_b32_e32 v37, s86 -; VI-NEXT: v_mov_b32_e32 v52, s84 +; VI-NEXT: v_mov_b32_e32 v54, s69 +; VI-NEXT: v_mov_b32_e32 v43, s68 +; VI-NEXT: v_mov_b32_e32 v49, s67 +; VI-NEXT: v_mov_b32_e32 v37, s66 +; VI-NEXT: v_mov_b32_e32 v55, s65 +; VI-NEXT: v_mov_b32_e32 v52, s64 +; VI-NEXT: v_mov_b32_e32 v53, s54 +; VI-NEXT: v_mov_b32_e32 v60, s87 +; VI-NEXT: v_mov_b32_e32 v39, s86 +; VI-NEXT: v_mov_b32_e32 v48, s84 ; VI-NEXT: v_mov_b32_e32 v51, s6 -; VI-NEXT: v_mov_b32_e32 v54, s5 -; VI-NEXT: v_mov_b32_e32 v23, s83 -; VI-NEXT: v_mov_b32_e32 v24, s50 +; VI-NEXT: v_mov_b32_e32 v44, s83 +; VI-NEXT: v_mov_b32_e32 v11, s50 ; VI-NEXT: v_mov_b32_e32 v26, s51 ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_mov_b32_e32 v20, s76 @@ -188820,16 +188988,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v17, s62 ; VI-NEXT: v_mov_b32_e32 v16, s60 ; VI-NEXT: v_mov_b32_e32 v13, s58 -; VI-NEXT: v_mov_b32_e32 v10, s56 -; VI-NEXT: v_mov_b32_e32 v7, s46 +; VI-NEXT: v_mov_b32_e32 v9, s56 ; VI-NEXT: v_mov_b32_e32 v3, s90 ; VI-NEXT: v_mov_b32_e32 v4, s30 ; VI-NEXT: v_mov_b32_e32 v5, s34 -; VI-NEXT: v_mov_b32_e32 v8, s36 -; VI-NEXT: v_mov_b32_e32 v11, s38 +; VI-NEXT: v_mov_b32_e32 v30, s36 +; VI-NEXT: v_mov_b32_e32 v24, s38 ; VI-NEXT: v_mov_b32_e32 v14, s48 ; VI-NEXT: .LBB95_5: ; %end -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 @@ -188871,30 +189038,30 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v58, v53, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v58, v23, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v53, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v23, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v58, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v46 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v46, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v20, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v19, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -188903,36 +189070,38 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v19, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 @@ -188940,19 +189109,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 @@ -188960,19 +189131,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -188982,21 +189155,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189006,38 +189179,43 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 ; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189046,130 +189224,121 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v23 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189177,28 +189346,28 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189206,15 +189375,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191614,24 +191783,22 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 @@ -191648,160 +191815,164 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v20 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:360 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 @@ -191810,21 +191981,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 @@ -191833,23 +192004,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 @@ -191858,23 +192029,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 @@ -191882,25 +192053,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 @@ -191911,66 +192083,65 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:368 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -191978,664 +192149,661 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v8, v16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v12, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v5, v2, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v22, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v15, v24, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v18, v36, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v7, v16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v11, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v19, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v10, v24, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v15, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v20, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v18, v25, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v19, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v30, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v21, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v23, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v33, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v49, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v38, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v50, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v52, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v43, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v44, v7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v45, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v45, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v47, v7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v47, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v58, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v59, v7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v3, v34 +; SI-NEXT: v_or_b32_e32 v34, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v62, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v36, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v7, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v54, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v23, v1, v8 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v7, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v37, v37, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v1, v8 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v57, v51, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v7, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v35, v37, v22 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v51, v56, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v37, v51, v22 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v56, v60, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v51, v22, v63 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v34, v56, v22 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v60, v7, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v7, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v12, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v56, v60, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v54, v22, v4 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v9, v9, v22 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v62, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v46, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v57, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v60, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v22, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v48, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v40, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v42, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v53, v22, v53 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v42 -; SI-NEXT: v_or_b32_e32 v31, v22, v31 -; SI-NEXT: v_or_b32_e32 v22, v12, v61 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v43, v12, v5 -; SI-NEXT: v_alignbit_b32 v5, v22, v5, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v12, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v32, v32, v59 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v25, v6, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 -; SI-NEXT: v_alignbit_b32 v11, v5, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v42, v11, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v40, v11, v15 -; SI-NEXT: v_alignbit_b32 v11, v42, v15, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v11, v18 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v39, v11, v19 -; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v5, v11, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v28, v11, v30 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v11, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v11, v6, v15, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v26, v11, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v11, v11, v19 ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v28, v11, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v20, v11, v21 ; SI-NEXT: v_alignbit_b32 v11, v28, v21, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v29, v11, v33 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v21, v11, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v23, v11, v27 ; SI-NEXT: v_alignbit_b32 v11, v29, v27, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v19, v11, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v27, v11, v52 -; SI-NEXT: v_alignbit_b32 v11, v19, v52, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v27, v11, v50 +; SI-NEXT: v_alignbit_b32 v11, v19, v50, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v2 -; SI-NEXT: v_alignbit_b32 v1, v11, v55, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v15, v1, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v21, v15, v53 +; SI-NEXT: v_alignbit_b32 v15, v11, v53, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v2 +; SI-NEXT: v_or_b32_e32 v46, v3, v43 +; SI-NEXT: v_alignbit_b32 v3, v15, v43, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v17, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v39, v3, v45 +; SI-NEXT: v_alignbit_b32 v3, v17, v45, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v61, v3, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v17, v1, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v3, v61, v58, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v47, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v62, v3, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v40, v3, v36 +; SI-NEXT: v_alignbit_b32 v3, v62, v36, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v59, v3, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v31, v3, v54 +; SI-NEXT: v_alignbit_b32 v3, v59, v54, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v47, v3, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v25, v3, v57 +; SI-NEXT: v_alignbit_b32 v3, v47, v57, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v16 -; SI-NEXT: v_alignbit_b32 v32, v1, v59, 16 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v59, v6, v23 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v62, v32, v24 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v50, v6, v36 -; SI-NEXT: v_alignbit_b32 v6, v59, v36, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v47, v6, v35 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v49, v6, v37 -; SI-NEXT: v_alignbit_b32 v6, v47, v37, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v45, v6, v34 -; SI-NEXT: v_or_b32_e32 v48, v3, v56 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v45, v3, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v3, v56 ; SI-NEXT: v_alignbit_b32 v3, v45, v56, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v44, v3, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_mov_b32_e32 v14, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -192840,163 +193008,191 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: v_or_b32_e32 v46, v32, v13 -; SI-NEXT: v_alignbit_b32 v13, v62, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v44, v9, 16 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v12, v1, v9 +; SI-NEXT: v_alignbit_b32 v1, v44, v9, 16 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_mov_b32_e32 v13, v25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_mov_b32_e32 v14, v31 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: .LBB96_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 -; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v33, v31 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v23, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_mov_b32_e32 v30, v16 -; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v31, v24 +; SI-NEXT: v_mov_b32_e32 v32, v24 ; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v60, v4 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v63, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v45 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v5 -; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v47, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193005,16 +193201,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v7 -; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v7 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193023,15 +193218,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v59, vcc, s7, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193040,16 +193235,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v9 -; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v40, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193058,15 +193253,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v62, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193075,15 +193270,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193092,15 +193287,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193109,45 +193304,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v13 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193156,15 +193357,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193173,32 +193374,32 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v18, v18, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v23 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v19, v19, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v19 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193207,83 +193408,95 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v20, v20, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v21 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_or_b32_e32 v22, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v22, v22, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_or_b32_e32 v24, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v23, v17 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v26, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v23 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_or_b32_e32 v29, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v24, v24, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v25, v25, v17 -; SI-NEXT: v_or_b32_e32 v2, v25, v2 -; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14 -; SI-NEXT: v_mov_b32_e32 v14, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v21, v17 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v23 +; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v24 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193292,17 +193505,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193311,52 +193522,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v40, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193365,310 +193575,295 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v31, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v42, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v42 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v30, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v33, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v1, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v2 -; SI-NEXT: v_alignbit_b32 v2, v22, v43, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 +; SI-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v6, v16, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v26, v10, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v5, v7, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v28, v20, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v42, v40, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v29, v23, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v26, v39, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v28, v18, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v29, v21, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v11, v21, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v11, v16, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v15, v13, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v17, v10, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v15, v46, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v17, v39, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v1, v9, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v61, v9, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v59, v6, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v59, v14, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v47, v33, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v47, v13, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v45, v32, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v45, v35, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v44, v14, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_alignbit_b32 v2, v44, v12, 16 +; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v47 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v44 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -193680,9 +193875,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -193694,9 +193889,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -193704,13 +193899,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -193718,19 +193913,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 @@ -197895,8 +198090,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill @@ -197912,8 +198107,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 @@ -197929,12 +198123,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:228 @@ -197944,29 +198139,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:196 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_writelane_b32 v62, s28, 0 -; SI-NEXT: v_writelane_b32 v62, s25, 1 -; SI-NEXT: v_writelane_b32 v62, s24, 2 -; SI-NEXT: v_writelane_b32 v62, s23, 3 -; SI-NEXT: v_writelane_b32 v62, s22, 4 -; SI-NEXT: v_writelane_b32 v62, s21, 5 -; SI-NEXT: v_writelane_b32 v62, s18, 6 -; SI-NEXT: v_writelane_b32 v62, s16, 7 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 ; SI-NEXT: v_writelane_b32 v63, s35, 3 ; SI-NEXT: v_writelane_b32 v63, s36, 4 ; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v62, s28, 0 ; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v62, s25, 1 ; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v62, s24, 2 ; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v62, s23, 3 ; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v62, s22, 4 ; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v62, s21, 5 ; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v62, s18, 6 ; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v62, s16, 7 ; SI-NEXT: v_writelane_b32 v63, s54, 14 ; SI-NEXT: v_writelane_b32 v63, s55, 15 ; SI-NEXT: v_writelane_b32 v63, s64, 16 @@ -197988,9 +198183,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v63, s96, 32 ; SI-NEXT: v_writelane_b32 v63, s97, 33 ; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, v26 ; SI-NEXT: v_readfirstlane_b32 s15, v16 ; SI-NEXT: v_readfirstlane_b32 s18, v25 ; SI-NEXT: v_readfirstlane_b32 s43, v15 @@ -197999,6 +198191,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s49, v12 ; SI-NEXT: v_readfirstlane_b32 s8, v11 ; SI-NEXT: v_readfirstlane_b32 s53, v20 +; SI-NEXT: v_readfirstlane_b32 s40, v19 +; SI-NEXT: v_readfirstlane_b32 s23, v28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v62, s4, 8 @@ -198016,7 +198210,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:176 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144 @@ -198044,16 +198240,17 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s35, v43 -; SI-NEXT: v_readfirstlane_b32 s55, v46 ; SI-NEXT: v_readfirstlane_b32 s68, v35 +; SI-NEXT: v_readfirstlane_b32 s21, v36 ; SI-NEXT: v_readfirstlane_b32 s87, v37 ; SI-NEXT: v_readfirstlane_b32 s67, v39 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s74, v53 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 +; SI-NEXT: v_readfirstlane_b32 s35, v43 +; SI-NEXT: v_readfirstlane_b32 s55, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_readfirstlane_b32 s74, v53 ; SI-NEXT: v_readfirstlane_b32 s85, v48 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 @@ -198062,14 +198259,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v62, s4, 13 ; SI-NEXT: v_readfirstlane_b32 s98, v40 ; SI-NEXT: v_readfirstlane_b32 s69, v51 -; SI-NEXT: v_readfirstlane_b32 s21, v36 -; SI-NEXT: v_readfirstlane_b32 s40, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v28 ; SI-NEXT: v_readfirstlane_b32 s34, v27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v21, v13 -; SI-NEXT: v_mov_b32_e32 v13, v5 -; SI-NEXT: v_readfirstlane_b32 s97, v29 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s97, v26 ; SI-NEXT: v_readfirstlane_b32 s80, v18 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 @@ -198091,9 +198283,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s86, v61 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s90, v50 ; SI-NEXT: v_readfirstlane_b32 s31, v52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v55 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 @@ -198125,37 +198317,41 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s16, v34 ; SI-NEXT: v_readfirstlane_b32 s48, v32 ; SI-NEXT: v_readfirstlane_b32 s52, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: v_readfirstlane_b32 s47, v35 ; SI-NEXT: v_readfirstlane_b32 s60, v37 ; SI-NEXT: v_readfirstlane_b32 s61, v39 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_readfirstlane_b32 s54, v48 ; SI-NEXT: v_readfirstlane_b32 s89, v43 +; SI-NEXT: v_readfirstlane_b32 s50, v53 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s99, v46 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:280 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; SI-NEXT: v_readfirstlane_b32 s54, v48 -; SI-NEXT: v_readfirstlane_b32 s50, v53 ; SI-NEXT: v_readfirstlane_b32 s78, v49 ; SI-NEXT: v_readfirstlane_b32 s30, v51 ; SI-NEXT: v_readfirstlane_b32 s66, v54 ; SI-NEXT: v_readfirstlane_b32 s91, v40 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s6, v44 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s4, v10 ; SI-NEXT: v_writelane_b32 v62, s4, 15 ; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s22, v36 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:280 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; SI-NEXT: v_writelane_b32 v62, s17, 17 ; SI-NEXT: v_writelane_b32 v62, s15, 18 ; SI-NEXT: v_writelane_b32 v62, s18, 19 @@ -198196,34 +198392,27 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v62, s52, 51 ; SI-NEXT: v_writelane_b32 v62, s82, 52 ; SI-NEXT: v_writelane_b32 v62, s66, 53 -; SI-NEXT: v_readfirstlane_b32 s22, v36 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v57 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v57 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v58 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v58 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v59 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v59 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v56 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 -; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v42 -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v31 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v60 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v31 ; SI-NEXT: v_writelane_b32 v62, s91, 54 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v5, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s5, v62, 5 ; SI-NEXT: s_and_b32 s4, s20, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 @@ -198398,106 +198587,111 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s24, s9, s14 ; SI-NEXT: s_and_b32 s14, s93, 0xff ; SI-NEXT: s_lshl_b32 s15, s84, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v53, v6, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v50, s14, v53 +; SI-NEXT: v_or_b32_e32 v47, s14, v53 ; SI-NEXT: s_and_b32 s14, s8, 0xff ; SI-NEXT: s_lshl_b32 s15, s49, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v54, v14, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v17, s14, v54 +; SI-NEXT: v_or_b32_e32 v45, s14, v54 ; SI-NEXT: s_and_b32 s14, s40, 0xff ; SI-NEXT: s_lshl_b32 s15, s53, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v55, v18, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v16, s14, v55 +; SI-NEXT: v_or_b32_e32 v11, s14, v55 ; SI-NEXT: s_and_b32 s14, s34, 0xff ; SI-NEXT: s_lshl_b32 s15, s23, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v40, v19, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v15, s14, v40 +; SI-NEXT: v_or_b32_e32 v50, s14, v40 ; SI-NEXT: s_and_b32 s14, s91, 0xff ; SI-NEXT: s_lshl_b32 s15, s66, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v41, v22, v1 +; SI-NEXT: v_or_b32_e32 v41, v20, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v12, s14, v41 +; SI-NEXT: v_or_b32_e32 v17, s14, v41 ; SI-NEXT: s_and_b32 s14, s50, 0xff ; SI-NEXT: s_lshl_b32 s15, s54, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v42, v23, v1 +; SI-NEXT: v_or_b32_e32 v42, v22, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v11, s14, v42 +; SI-NEXT: v_or_b32_e32 v16, s14, v42 ; SI-NEXT: s_and_b32 s14, s73, 0xff ; SI-NEXT: s_lshl_b32 s15, s36, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v28, v59, v1 +; SI-NEXT: v_or_b32_e32 v59, v23, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v10, s14, v28 +; SI-NEXT: v_or_b32_e32 v15, s14, v59 ; SI-NEXT: s_and_b32 s14, s82, 0xff ; SI-NEXT: s_lshl_b32 s15, s52, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v60, v24, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v9, s14, v60 +; SI-NEXT: v_or_b32_e32 v12, s14, v60 ; SI-NEXT: s_and_b32 s14, s90, 0xff ; SI-NEXT: s_lshl_b32 s15, s16, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v31, v44, v1 +; SI-NEXT: v_or_b32_e32 v2, v25, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v8, s14, v31 +; SI-NEXT: v_or_b32_e32 v10, s14, v2 ; SI-NEXT: s_and_b32 s14, s62, 0xff ; SI-NEXT: s_lshl_b32 s15, s65, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v61, v45, v1 +; SI-NEXT: v_or_b32_e32 v61, v26, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v7, s14, v61 +; SI-NEXT: v_or_b32_e32 v8, s14, v61 ; SI-NEXT: s_and_b32 s14, s98, 0xff ; SI-NEXT: s_lshl_b32 s15, s67, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v6, v47, v1 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_or_b32_e32 v6, v27, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v4, s14, v6 +; SI-NEXT: v_or_b32_e32 v7, s14, v6 ; SI-NEXT: s_and_b32 s14, s92, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_readlane_b32 s8, v62, 55 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v22, v14 -; SI-NEXT: v_or_b32_e32 v14, v56, v1 +; SI-NEXT: v_mov_b32_e32 v28, v38 +; SI-NEXT: v_mov_b32_e32 v38, v25 +; SI-NEXT: v_mov_b32_e32 v25, v35 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_mov_b32_e32 v22, v32 +; SI-NEXT: v_mov_b32_e32 v32, v18 +; SI-NEXT: v_mov_b32_e32 v18, v13 +; SI-NEXT: v_or_b32_e32 v13, v46, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v2, s14, v14 +; SI-NEXT: v_or_b32_e32 v4, s14, v13 ; SI-NEXT: s_and_b32 s14, s70, 0xff ; SI-NEXT: s_lshl_b32 s15, s94, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 @@ -198506,17 +198700,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: v_mov_b32_e32 v23, v18 -; SI-NEXT: v_or_b32_e32 v18, v57, v1 +; SI-NEXT: v_mov_b32_e32 v48, v27 +; SI-NEXT: v_mov_b32_e32 v27, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v24, v34 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v20, v19 +; SI-NEXT: v_mov_b32_e32 v19, v29 +; SI-NEXT: v_or_b32_e32 v29, v30, v1 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s40, s8, s5 ; SI-NEXT: v_readlane_b32 s8, v62, 57 -; SI-NEXT: v_or_b32_e32 v1, s14, v18 +; SI-NEXT: v_or_b32_e32 v31, s14, v29 ; SI-NEXT: s_and_b32 s14, s88, 0xff ; SI-NEXT: s_lshl_b32 s15, s79, 8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v43 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_readlane_b32 s9, v62, 60 ; SI-NEXT: s_or_b32 s14, s14, s15 @@ -198524,15 +198722,20 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s15, s8, s41 ; SI-NEXT: v_readlane_b32 s8, v62, 58 ; SI-NEXT: s_and_b32 s16, s9, 0xffff -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v26, v24 -; SI-NEXT: v_mov_b32_e32 v24, v19 -; SI-NEXT: v_or_b32_e32 v19, v58, v3 +; SI-NEXT: v_mov_b32_e32 v9, v30 +; SI-NEXT: v_mov_b32_e32 v30, v39 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v23 +; SI-NEXT: v_mov_b32_e32 v23, v33 +; SI-NEXT: v_mov_b32_e32 v33, v5 +; SI-NEXT: v_mov_b32_e32 v5, v14 +; SI-NEXT: v_or_b32_e32 v14, v44, v3 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s36, s16, s56 ; SI-NEXT: s_and_b32 s16, s48, 0xffff -; SI-NEXT: v_or_b32_e32 v3, s14, v19 +; SI-NEXT: v_or_b32_e32 v49, s14, v14 ; SI-NEXT: s_or_b32 s14, s8, s43 ; SI-NEXT: s_and_b32 s8, s13, 0xffff ; SI-NEXT: s_or_b32 s53, s16, vcc_lo @@ -198558,36 +198761,56 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s8, s8, s47 ; SI-NEXT: s_or_b32 s9, s16, s77 ; SI-NEXT: s_or_b32 s16, s23, s78 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v30, v37 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v20, v47 -; SI-NEXT: v_mov_b32_e32 v49, v56 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v57 -; SI-NEXT: v_mov_b32_e32 v25, v58 ; SI-NEXT: v_alignbit_b32 v57, s42, v51, 16 ; SI-NEXT: v_alignbit_b32 v58, s40, v52, 16 ; SI-NEXT: v_alignbit_b32 v56, s15, v53, 16 -; SI-NEXT: v_alignbit_b32 v47, s14, v54, 16 +; SI-NEXT: v_alignbit_b32 v51, s14, v54, 16 +; SI-NEXT: v_alignbit_b32 v1, s13, v55, 16 +; SI-NEXT: v_alignbit_b32 v3, s12, v40, 16 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v46, s13, v55, 16 -; SI-NEXT: v_alignbit_b32 v45, s12, v40, 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v44, s10, v41, 16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v43, s8, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, s36, v28, 16 +; SI-NEXT: v_alignbit_b32 v42, s36, v59, 16 ; SI-NEXT: v_alignbit_b32 v41, s53, v60, 16 -; SI-NEXT: v_alignbit_b32 v40, s94, v31, 16 +; SI-NEXT: v_alignbit_b32 v40, s94, v2, 16 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v7 +; SI-NEXT: v_mov_b32_e32 v7, v8 +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: v_mov_b32_e32 v10, v15 +; SI-NEXT: v_mov_b32_e32 v15, v17 +; SI-NEXT: v_mov_b32_e32 v17, v11 ; SI-NEXT: v_alignbit_b32 v55, s49, v61, 16 ; SI-NEXT: v_alignbit_b32 v54, s48, v6, 16 -; SI-NEXT: v_alignbit_b32 v53, s11, v14, 16 -; SI-NEXT: v_mov_b32_e32 v14, v22 -; SI-NEXT: v_alignbit_b32 v52, s9, v18, 16 -; SI-NEXT: v_mov_b32_e32 v18, v23 -; SI-NEXT: v_alignbit_b32 v51, s16, v19, 16 -; SI-NEXT: v_mov_b32_e32 v19, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 +; SI-NEXT: v_alignbit_b32 v53, s11, v13, 16 +; SI-NEXT: v_mov_b32_e32 v13, v18 +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v32, v22 +; SI-NEXT: v_mov_b32_e32 v22, v35 +; SI-NEXT: v_mov_b32_e32 v35, v25 +; SI-NEXT: v_mov_b32_e32 v25, v38 +; SI-NEXT: v_mov_b32_e32 v38, v28 +; SI-NEXT: v_alignbit_b32 v52, s9, v29, 16 +; SI-NEXT: v_mov_b32_e32 v29, v19 +; SI-NEXT: v_mov_b32_e32 v19, v20 +; SI-NEXT: v_mov_b32_e32 v20, v34 +; SI-NEXT: v_mov_b32_e32 v34, v24 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_mov_b32_e32 v27, v48 +; SI-NEXT: v_alignbit_b32 v28, s16, v14, 16 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v5, v33 +; SI-NEXT: v_mov_b32_e32 v33, v23 +; SI-NEXT: v_mov_b32_e32 v23, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v26, v39 +; SI-NEXT: v_mov_b32_e32 v39, v30 +; SI-NEXT: v_mov_b32_e32 v30, v9 +; SI-NEXT: v_mov_b32_e32 v9, v12 +; SI-NEXT: v_mov_b32_e32 v12, v16 +; SI-NEXT: v_mov_b32_e32 v16, v50 ; SI-NEXT: s_lshr_b32 s73, s63, 16 ; SI-NEXT: s_lshr_b32 s72, s5, 16 ; SI-NEXT: s_lshr_b32 s63, s41, 16 @@ -198627,84 +198850,118 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b32 s96, s4 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: v_mov_b32_e32 v5, v27 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s4, s88, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: s_add_i32 s4, s4, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 10 ; SI-NEXT: v_readlane_b32 s6, v62, 9 -; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_add_i32 s8, s6, 3 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_add_i32 s70, s70, 3 +; SI-NEXT: v_readlane_b32 s6, v62, 30 +; SI-NEXT: s_add_i32 s9, s77, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s79, s92, 3 +; SI-NEXT: s_add_i32 s52, s98, 3 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v37 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v27, v4 +; SI-NEXT: s_add_i32 s30, s87, 3 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: s_add_i32 s50, s90, 3 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v35 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v25, v8 +; SI-NEXT: s_add_i32 s94, s86, 3 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v9, v24, v9 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v33 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v23, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v22, v11 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: v_readlane_b32 s4, v62, 11 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readlane_b32 s5, v62, 8 -; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s70, s70, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 30 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s70, 0xff ; SI-NEXT: s_lshl_b32 s8, s6, 8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v39 ; SI-NEXT: s_or_b32 s5, s8, s5 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v39, v2 +; SI-NEXT: v_or_b32_e32 v2, v30, v2 ; SI-NEXT: v_or_b32_e32 v2, s5, v2 ; SI-NEXT: s_add_i32 s5, s35, 3 ; SI-NEXT: v_readlane_b32 s6, v62, 13 ; SI-NEXT: s_and_b32 s5, s5, 0xff ; SI-NEXT: s_lshl_b32 s8, s6, 8 -; SI-NEXT: s_add_i32 s9, s77, 3 ; SI-NEXT: s_or_b32 s5, s8, s5 ; SI-NEXT: v_readlane_b32 s6, v62, 12 -; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s8, s6, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_add_i32 s79, s92, 3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1 ; SI-NEXT: s_add_i32 s16, s4, 0x3000000 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v2 ; SI-NEXT: s_add_i32 s9, s5, 0x3000000 ; SI-NEXT: s_and_b32 s4, s79, 0xff ; SI-NEXT: s_lshl_b32 s5, s7, 8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v38 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v2, v46, v2 ; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s4, s24, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s55, 8 @@ -198717,18 +198974,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s52, s98, 3 ; SI-NEXT: s_add_i32 s11, s4, 0x3000000 ; SI-NEXT: s_and_b32 s4, s52, 0xff ; SI-NEXT: s_lshl_b32 s5, s67, 8 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v20, v4 -; SI-NEXT: s_add_i32 s30, s87, 3 ; SI-NEXT: v_or_b32_e32 v4, s4, v4 ; SI-NEXT: s_and_b32 s4, s30, 0xff ; SI-NEXT: s_lshl_b32 s5, s21, 8 @@ -198743,18 +198994,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s48, s4, 0x3000000 ; SI-NEXT: v_readlane_b32 s4, v62, 42 -; SI-NEXT: v_mov_b32_e32 v22, v30 ; SI-NEXT: s_add_i32 s87, s4, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 39 ; SI-NEXT: s_and_b32 s4, s87, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v7, v35, v7 ; SI-NEXT: v_or_b32_e32 v7, s4, v7 ; SI-NEXT: v_readlane_b32 s4, v62, 32 ; SI-NEXT: s_add_i32 s67, s4, 3 @@ -198769,7 +199015,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s50, s90, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 23 ; SI-NEXT: s_add_i32 s49, s4, 0x3000000 ; SI-NEXT: s_and_b32 s4, s50, 0xff @@ -198777,14 +199022,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s94, s86, 3 ; SI-NEXT: v_or_b32_e32 v8, s4, v8 ; SI-NEXT: s_and_b32 s4, s94, 0xff ; SI-NEXT: s_lshl_b32 s5, s38, 8 ; SI-NEXT: s_add_i32 s8, s71, 3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -198800,11 +199041,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v24, v9 ; SI-NEXT: v_or_b32_e32 v9, s4, v9 ; SI-NEXT: v_readlane_b32 s4, v62, 45 ; SI-NEXT: s_add_i32 s98, s4, 3 @@ -198828,11 +199066,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s86, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v10, v59, v10 ; SI-NEXT: v_or_b32_e32 v10, s4, v10 ; SI-NEXT: v_readlane_b32 s4, v62, 31 ; SI-NEXT: s_add_i32 s66, s4, 3 @@ -198857,27 +199092,19 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 37 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s71, s22, 3 -; SI-NEXT: s_and_b32 s8, s71, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s35, s99, 3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 ; SI-NEXT: v_or_b32_e32 v11, s4, v11 ; SI-NEXT: v_readlane_b32 s4, v62, 40 ; SI-NEXT: s_add_i32 s85, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 37 ; SI-NEXT: s_and_b32 s4, s85, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s71, s22, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readlane_b32 s5, v62, 33 +; SI-NEXT: s_and_b32 s8, s71, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s8 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -198890,60 +199117,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 47 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_and_b32 s6, s35, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_mov_b32_e32 v30, s16 -; SI-NEXT: v_mov_b32_e32 v39, s9 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; SI-NEXT: v_mov_b32_e32 v28, s11 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 -; SI-NEXT: v_mov_b32_e32 v27, s48 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; SI-NEXT: v_mov_b32_e32 v26, s49 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 -; SI-NEXT: v_mov_b32_e32 v25, s94 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; SI-NEXT: v_mov_b32_e32 v24, s53 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 -; SI-NEXT: v_mov_b32_e32 v23, s36 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11 -; SI-NEXT: v_mov_b32_e32 v22, s8 -; SI-NEXT: v_alignbit_b32 v43, v22, v11, 16 -; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 -; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 -; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 -; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 -; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 -; SI-NEXT: s_lshr_b32 s58, s8, 16 -; SI-NEXT: s_lshr_b32 s57, s36, 16 -; SI-NEXT: s_lshr_b32 s56, s53, 16 -; SI-NEXT: s_lshr_b32 s47, s94, 16 -; SI-NEXT: s_lshr_b32 s46, s49, 16 -; SI-NEXT: s_lshr_b32 s45, s48, 16 -; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: s_lshr_b32 s43, s9, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s4, v62, 48 ; SI-NEXT: s_add_i32 s7, s4, 3 +; SI-NEXT: v_readlane_b32 s5, v62, 47 ; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_add_i32 s35, s99, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s35, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s89, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -198951,8 +199136,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s4, v62, 46 ; SI-NEXT: s_add_i32 s99, s4, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 44 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s99, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -198980,10 +199167,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s4, v62, 34 ; SI-NEXT: s_add_i32 s69, s4, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 29 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s69, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -198991,6 +199178,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v18, v5 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s4, v62, 22 ; SI-NEXT: s_add_i32 s34, s4, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 21 @@ -199010,10 +199202,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s4, v62, 25 ; SI-NEXT: s_add_i32 s51, s4, 3 ; SI-NEXT: v_readlane_b32 s5, v62, 24 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 ; SI-NEXT: s_and_b32 s4, s51, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_addk_i32 s4, 0x300 @@ -199044,19 +199234,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s6, s64, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_add_i32_e32 v49, vcc, 0x3000000, v1 +; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v39, s9 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; SI-NEXT: v_mov_b32_e32 v28, s11 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 +; SI-NEXT: v_mov_b32_e32 v27, s48 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; SI-NEXT: v_mov_b32_e32 v26, s49 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; SI-NEXT: v_mov_b32_e32 v25, s94 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; SI-NEXT: v_mov_b32_e32 v24, s53 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 +; SI-NEXT: v_mov_b32_e32 v23, s36 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v6, v6, v13 ; SI-NEXT: v_or_b32_e32 v6, s4, v6 ; SI-NEXT: s_add_i32 s4, s83, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_add_i32 s6, s64, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readlane_b32 s5, v62, 15 -; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -199124,23 +199330,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s42, s4, 0x3000000 ; SI-NEXT: v_mov_b32_e32 v13, s18 +; SI-NEXT: v_mov_b32_e32 v22, s8 ; SI-NEXT: v_mov_b32_e32 v20, s10 ; SI-NEXT: v_mov_b32_e32 v19, s12 ; SI-NEXT: v_mov_b32_e32 v18, s13 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v45, vcc, 0x3000000, v5 ; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v6 +; SI-NEXT: v_add_i32_e32 v47, vcc, 0x3000000, v6 ; SI-NEXT: v_mov_b32_e32 v6, s15 ; SI-NEXT: v_alignbit_b32 v57, s42, v13, 16 ; SI-NEXT: v_mov_b32_e32 v13, s17 ; SI-NEXT: v_alignbit_b32 v58, s40, v13, 16 -; SI-NEXT: v_alignbit_b32 v56, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16 +; SI-NEXT: v_alignbit_b32 v56, v6, v47, 16 +; SI-NEXT: v_alignbit_b32 v51, v5, v45, 16 +; SI-NEXT: v_alignbit_b32 v1, v18, v17, 16 +; SI-NEXT: v_alignbit_b32 v3, v19, v16, 16 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16 +; SI-NEXT: v_alignbit_b32 v44, v20, v15, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 +; SI-NEXT: v_alignbit_b32 v43, v22, v12, 16 +; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 +; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 +; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 +; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 +; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 +; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 +; SI-NEXT: v_alignbit_b32 v52, v39, v31, 16 +; SI-NEXT: v_alignbit_b32 v28, v30, v49, 16 ; SI-NEXT: s_lshr_b32 s73, s42, 16 ; SI-NEXT: s_lshr_b32 s72, s40, 16 ; SI-NEXT: s_lshr_b32 s63, s15, 16 @@ -199148,8 +199364,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s61, s13, 16 ; SI-NEXT: s_lshr_b32 s60, s12, 16 ; SI-NEXT: s_lshr_b32 s59, s10, 16 +; SI-NEXT: s_lshr_b32 s58, s8, 16 +; SI-NEXT: s_lshr_b32 s57, s36, 16 +; SI-NEXT: s_lshr_b32 s56, s53, 16 +; SI-NEXT: s_lshr_b32 s47, s94, 16 +; SI-NEXT: s_lshr_b32 s46, s49, 16 +; SI-NEXT: s_lshr_b32 s45, s48, 16 +; SI-NEXT: s_lshr_b32 s44, s11, 16 +; SI-NEXT: s_lshr_b32 s43, s9, 16 +; SI-NEXT: s_lshr_b32 s41, s16, 16 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 ; SI-NEXT: v_or_b32_e32 v5, s4, v5 ; SI-NEXT: s_and_b32 s4, s42, 0xffff @@ -199173,7 +199399,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v47 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 ; SI-NEXT: s_and_b32 s4, s15, 0xffff @@ -199186,9 +199412,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v51 ; SI-NEXT: s_and_b32 s4, s14, 0xffff ; SI-NEXT: s_lshl_b32 s5, s62, 16 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 @@ -199199,9 +199425,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; SI-NEXT: s_and_b32 s4, s13, 0xffff ; SI-NEXT: s_lshl_b32 s5, s61, 16 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 @@ -199212,9 +199438,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; SI-NEXT: s_and_b32 s4, s12, 0xffff ; SI-NEXT: s_lshl_b32 s5, s60, 16 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 @@ -199225,7 +199451,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 ; SI-NEXT: s_and_b32 s4, s10, 0xffff @@ -199238,7 +199464,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v43 ; SI-NEXT: s_and_b32 s4, s8, 0xffff @@ -199327,7 +199553,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: s_and_b32 s4, s9, 0xffff ; SI-NEXT: s_lshl_b32 s5, s43, 16 @@ -199339,9 +199565,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s41, 16 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -199403,26 +199629,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: -; SI-NEXT: v_mov_b32_e32 v5, v13 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v58 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v57 -; SI-NEXT: v_mov_b32_e32 v49, v56 -; SI-NEXT: v_mov_b32_e32 v20, v47 -; SI-NEXT: v_mov_b32_e32 v30, v37 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v32, v23 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $sgpr42 @@ -199431,27 +199643,27 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr58 @@ -199479,12 +199691,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $sgpr16 ; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: s_branch .LBB97_2 @@ -199508,22 +199720,22 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -199549,13 +199761,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -199564,49 +199774,46 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -199615,34 +199822,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -199652,131 +199860,155 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -199785,225 +200017,205 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, v8 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v10 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v32, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v42, v43 +; VI-NEXT: v_mov_b32_e32 v43, v37 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v54 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v38, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v39 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v52, v60 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v35 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -200032,14 +200244,54 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB97_3 -; VI-NEXT: .LBB97_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB97_3 +; VI-NEXT: .LBB97_2: +; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v7 +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB97_3: ; %Flow +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB97_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -200058,351 +200310,356 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 -; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 +; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v13, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v50, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 +; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v56, s4, v56 ; VI-NEXT: s_and_b32 s4, s26, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s24, 0xff @@ -200415,35 +200672,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_or_b32 s8, s9, s8 ; VI-NEXT: s_and_b32 s9, s16, 0xff ; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_addk_i32 s7, 0x300 ; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s8, s8, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: .LBB97_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -200462,39 +200710,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB97_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB97_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX9: ; %bb.0: @@ -200515,31 +200730,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 @@ -200549,133 +200769,129 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 @@ -200683,148 +200899,149 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -200832,19 +201049,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -200866,272 +201076,291 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v57, v5 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v39, v16 -; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v55, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v53, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v52, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v50, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v49, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v51, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v59, v44 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v57, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB97_3 ; GFX9-NEXT: .LBB97_2: -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v51, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v46, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB97_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB97_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 @@ -201144,61 +201373,55 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s9, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_and_b32 s4, s24, 0xff ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_and_b32 s5, s26, 0xff @@ -201210,8 +201433,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 ; GFX9-NEXT: s_and_b32 s9, s18, 0xff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -201228,14 +201449,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201243,9 +201464,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -201255,254 +201476,277 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 -; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 ; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 ; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 ; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 ; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 @@ -201514,33 +201758,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: .LBB97_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -205289,36 +205514,88 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 @@ -205327,20 +205604,68 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 @@ -205411,7 +205736,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 @@ -205422,151 +205746,53 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -205580,143 +205806,148 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v10 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v11 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v12 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v13 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v14 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v15 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v8 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v5, v3 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[23:24] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v17 ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v46 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 ; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 ; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 ; VI-NEXT: v_mov_b32_e32 v46, v1 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr3 @@ -205739,93 +205970,89 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_cbranch_execz .LBB98_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v31, 3 -; VI-NEXT: v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v55, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v32, 3, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; VI-NEXT: v_add_u16_sdwa v54, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v18, v32, v18 ; VI-NEXT: v_add_u16_e32 v32, 3, v17 -; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v54 +; VI-NEXT: v_add_u16_sdwa v38, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v17, v32, v17 ; VI-NEXT: v_add_u16_e32 v32, 3, v20 -; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v38 +; VI-NEXT: v_add_u16_sdwa v62, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v20, v32, v20 ; VI-NEXT: v_add_u16_e32 v32, 3, v19 -; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v62 ; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v19, v32, v19 ; VI-NEXT: v_add_u16_e32 v32, 3, v22 ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 ; VI-NEXT: v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v22, v32, v22 ; VI-NEXT: v_add_u16_e32 v32, 3, v21 ; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 ; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v21, v32, v21 ; VI-NEXT: v_add_u16_e32 v32, 3, v24 ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v49, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v24, v32, v24 ; VI-NEXT: v_add_u16_e32 v32, 3, v23 -; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 ; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v23, v32, v23 ; VI-NEXT: v_add_u16_e32 v32, 3, v26 ; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v26, v32, v26 ; VI-NEXT: v_add_u16_e32 v32, 3, v25 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v25, v32, v25 ; VI-NEXT: v_add_u16_e32 v32, 3, v28 ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v28, v32, v28 ; VI-NEXT: v_add_u16_e32 v32, 3, v27 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v27, v32, v27 ; VI-NEXT: v_add_u16_e32 v33, 3, v30 ; VI-NEXT: v_add_u16_e32 v34, 3, v29 ; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 ; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v30, v33, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 ; VI-NEXT: v_add_u16_e32 v33, 3, v37 ; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v29, v34, v29 ; VI-NEXT: v_add_u16_e32 v34, 3, v36 ; VI-NEXT: v_or_b32_e32 v37, v33, v32 ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50 ; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v36, v34, v32 ; VI-NEXT: v_add_u16_e32 v33, 3, v2 ; VI-NEXT: v_add_u16_e32 v34, 3, v1 @@ -205834,9 +206061,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v33, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 ; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v1, v34, v1 ; VI-NEXT: v_add_u16_e32 v33, 3, v4 ; VI-NEXT: v_add_u16_e32 v34, 3, v3 @@ -205845,9 +206072,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v4, v33, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 ; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v3, v34, v3 ; VI-NEXT: v_add_u16_e32 v33, 3, v6 ; VI-NEXT: v_add_u16_e32 v34, 3, v5 @@ -205855,203 +206082,206 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 ; VI-NEXT: v_or_b32_e32 v6, v33, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v34, v5 ; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_add_u16_e32 v38, 3, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v40, 3, v8 ; VI-NEXT: v_add_u16_e32 v33, 3, v7 ; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; VI-NEXT: v_or_b32_e32 v8, v38, v7 +; VI-NEXT: v_or_b32_e32 v8, v40, v7 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v7, v33, v7 ; VI-NEXT: v_add_u16_e32 v33, 3, v10 -; VI-NEXT: v_add_u16_e32 v38, 3, v9 +; VI-NEXT: v_add_u16_e32 v40, 3, v9 ; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 ; VI-NEXT: v_or_b32_e32 v10, v33, v9 ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 ; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v9, v38, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v40, v9 ; VI-NEXT: v_add_u16_e32 v33, 3, v12 -; VI-NEXT: v_add_u16_e32 v38, 3, v11 +; VI-NEXT: v_add_u16_e32 v40, 3, v11 ; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v12, v33, v11 ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 ; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v11, v38, v11 -; VI-NEXT: v_add_u16_e32 v38, 3, v14 -; VI-NEXT: v_add_u16_e32 v49, 3, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v11, v40, v11 +; VI-NEXT: v_add_u16_e32 v40, 3, v14 +; VI-NEXT: v_add_u16_e32 v41, 3, v13 ; VI-NEXT: v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 ; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v38, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v14, v40, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 ; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_add_u16_e32 v32, 3, v15 ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v16, v16, v15 ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v15, v32, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v16 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v13, v49, v13 +; VI-NEXT: v_or_b32_e32 v13, v41, v13 ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[23:24] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v60, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v63, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v59, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v34, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v47, 8, 8 ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v56, 8, 8 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v57, 8, 8 ; VI-NEXT: v_mov_b32_e32 v46, v35 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v52, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v39, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v51, v49 ; VI-NEXT: v_mov_b32_e32 v49, v53 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v52, v51 -; VI-NEXT: v_bfe_u32 v31, v51, 8, 8 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 +; VI-NEXT: v_mov_b32_e32 v53, v38 +; VI-NEXT: v_mov_b32_e32 v38, v55 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v35, v58, 8, 8 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v39, v61, 8, 8 ; VI-NEXT: v_bfe_u32 v58, v48, 8, 8 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_mov_b32_e32 v55, v31 ; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 +; VI-NEXT: v_bfe_u32 v31, v38, 8, 8 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206061,22 +206291,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206087,28 +206317,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206116,28 +206346,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206145,28 +206375,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206174,28 +206404,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206203,28 +206433,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206232,76 +206462,79 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 @@ -206309,8 +206542,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206320,9 +206553,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206333,22 +206566,23 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206359,15 +206593,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206375,9 +206609,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206388,15 +206622,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206404,29 +206638,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206434,21 +206668,21 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206461,18 +206695,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -208710,8 +208943,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 @@ -208722,14 +208955,14 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: v_writelane_b32 v63, s31, 1 @@ -208785,15 +209018,15 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s85, v5 ; SI-NEXT: v_readfirstlane_b32 s81, v2 ; SI-NEXT: v_readfirstlane_b32 s82, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_writelane_b32 v62, s6, 0 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s88, v36 -; SI-NEXT: v_readfirstlane_b32 s18, v37 +; SI-NEXT: v_readfirstlane_b32 s88, v32 +; SI-NEXT: v_readfirstlane_b32 s18, v34 ; SI-NEXT: v_readfirstlane_b32 s78, v38 ; SI-NEXT: v_readfirstlane_b32 s79, v39 ; SI-NEXT: v_readfirstlane_b32 s76, v48 @@ -208808,20 +209041,17 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v40 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v41 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v42 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v43 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v55 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -208831,10 +209061,12 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: s_or_b32 s57, s4, s5 ; SI-NEXT: v_mov_b32_e32 v1, s56 +; SI-NEXT: v_alignbit_b32 v2, s57, v1, 24 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: v_alignbit_b32 v8, s57, v1, 24 -; SI-NEXT: v_alignbit_b32 v50, s57, v1, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, s57, v1, 16 ; SI-NEXT: v_alignbit_b32 v1, s57, v1, 8 ; SI-NEXT: s_or_b32 s46, s4, s5 ; SI-NEXT: s_and_b32 s4, s22, 0xffff @@ -208845,43 +209077,47 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v1, s46 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s47, v1, 24 +; SI-NEXT: v_alignbit_b32 v2, s47, v1, 24 ; SI-NEXT: s_or_b32 s44, s4, s5 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s47, v1, 16 +; SI-NEXT: v_alignbit_b32 v2, s47, v1, 16 ; SI-NEXT: v_alignbit_b32 v51, s47, v1, 8 ; SI-NEXT: s_or_b32 s45, s4, s5 ; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, s45, v1, 24 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s45, v1, 24 +; SI-NEXT: v_alignbit_b32 v2, s45, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s45, v1, 8 ; SI-NEXT: s_or_b32 s42, s4, s5 ; SI-NEXT: s_and_b32 s4, s82, 0xffff ; SI-NEXT: s_lshl_b32 s5, s81, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s45, v1, 16 -; SI-NEXT: v_alignbit_b32 v49, s45, v1, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s42 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s43, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, s43, v1, 24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, s43, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, s43, v1, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s43, v1, 16 -; SI-NEXT: v_alignbit_b32 v48, s43, v1, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: s_and_b32 s4, s85, 0xffff ; SI-NEXT: s_lshl_b32 s5, s84, 16 -; SI-NEXT: v_or_b32_e32 v16, v1, v2 +; SI-NEXT: v_or_b32_e32 v16, v1, v25 ; SI-NEXT: s_or_b32 s41, s4, s5 ; SI-NEXT: v_alignbit_b32 v1, s41, v16, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill @@ -208901,7 +209137,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s15, s4, s5 ; SI-NEXT: s_and_b32 s4, s58, 0xffff ; SI-NEXT: s_lshl_b32 s5, s99, 16 -; SI-NEXT: v_or_b32_e32 v14, v1, v4 +; SI-NEXT: v_or_b32_e32 v14, v1, v26 ; SI-NEXT: s_or_b32 s14, s4, s5 ; SI-NEXT: s_and_b32 s4, s61, 0xffff ; SI-NEXT: s_lshl_b32 s5, s98, 16 @@ -208927,7 +209163,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s10, s4, s5 ; SI-NEXT: s_and_b32 s4, s75, 0xffff ; SI-NEXT: s_lshl_b32 s5, s74, 16 -; SI-NEXT: v_or_b32_e32 v12, v1, v5 +; SI-NEXT: v_or_b32_e32 v12, v1, v29 ; SI-NEXT: s_or_b32 s9, s4, s5 ; SI-NEXT: s_and_b32 s4, s77, 0xffff ; SI-NEXT: s_lshl_b32 s5, s76, 16 @@ -208951,7 +209187,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: v_writelane_b32 v62, s4, 1 ; SI-NEXT: s_lshr_b32 s4, s10, 8 -; SI-NEXT: v_or_b32_e32 v10, v1, v6 +; SI-NEXT: v_or_b32_e32 v10, v1, v30 ; SI-NEXT: v_writelane_b32 v62, s4, 3 ; SI-NEXT: s_lshr_b32 s4, s9, 8 ; SI-NEXT: v_alignbit_b32 v1, s14, v10, 24 @@ -208972,49 +209208,40 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 ; SI-NEXT: v_writelane_b32 v62, s4, 15 ; SI-NEXT: s_and_b32 s4, s72, 0xffff -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v1, v9 +; SI-NEXT: v_or_b32_e32 v8, v1, v32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: s_and_b32 s4, s74, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v1, v13 +; SI-NEXT: v_or_b32_e32 v5, v1, v34 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 ; SI-NEXT: v_writelane_b32 v62, s4, 5 ; SI-NEXT: s_and_b32 s4, s76, 0xffff -; SI-NEXT: v_mov_b32_e32 v28, v13 -; SI-NEXT: v_or_b32_e32 v13, v1, v17 +; SI-NEXT: v_or_b32_e32 v13, v1, v28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: v_writelane_b32 v62, s4, 8 ; SI-NEXT: s_and_b32 s4, s78, 0xffff -; SI-NEXT: v_mov_b32_e32 v26, v9 -; SI-NEXT: v_or_b32_e32 v9, v1, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v1, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 ; SI-NEXT: v_writelane_b32 v62, s4, 11 ; SI-NEXT: s_and_b32 s4, s88, 0xffff -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: v_or_b32_e32 v6, v1, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v6, v1, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; SI-NEXT: v_writelane_b32 v62, s4, 14 ; SI-NEXT: s_bfe_u32 s4, s74, 0x80008 -; SI-NEXT: v_or_b32_e32 v4, v1, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v4, v1, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: s_bfe_u32 s4, s76, 0x80008 -; SI-NEXT: v_or_b32_e32 v2, v1, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v1, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v37 ; SI-NEXT: v_writelane_b32 v62, s4, 7 ; SI-NEXT: s_bfe_u32 s4, s78, 0x80008 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v50 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: s_bfe_u32 s4, s88, 0x80008 -; SI-NEXT: v_mov_b32_e32 v29, v17 -; SI-NEXT: v_mov_b32_e32 v30, v18 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: v_mov_b32_e32 v39, v24 ; SI-NEXT: s_lshr_b32 s68, s57, 8 ; SI-NEXT: s_lshr_b32 s65, s47, 8 ; SI-NEXT: s_lshr_b32 s54, s45, 8 @@ -209075,9 +209302,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v54, s6, v1, 8 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_and_b32 s4, s18, 0xffff ; SI-NEXT: s_lshl_b32 s5, s88, 16 @@ -209181,58 +209405,64 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s57, s4, 0x30000 ; SI-NEXT: v_mov_b32_e32 v21, s56 ; SI-NEXT: v_alignbit_b32 v22, s57, v21, 24 -; SI-NEXT: v_alignbit_b32 v50, s57, v21, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s57, v21, 16 ; SI-NEXT: v_alignbit_b32 v21, s57, v21, 8 ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v21, s46 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, s47, v21, 24 -; SI-NEXT: s_lshr_b32 s4, s11, 8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v22, s47, v21, 16 ; SI-NEXT: v_alignbit_b32 v51, s47, v21, 8 ; SI-NEXT: v_mov_b32_e32 v21, s44 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24 +; SI-NEXT: s_lshr_b32 s4, s11, 8 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s45, v21, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 1 ; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24 +; SI-NEXT: v_mov_b32_e32 v21, s42 ; SI-NEXT: v_writelane_b32 v62, s4, 2 ; SI-NEXT: s_lshr_b32 s4, s10, 8 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v3, v25, v3 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16 -; SI-NEXT: v_alignbit_b32 v49, s45, v21, 8 -; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 3 ; SI-NEXT: s_lshr_b32 s4, s9, 24 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 +; SI-NEXT: v_or_b32_e32 v5, v39, v5 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v3 ; SI-NEXT: v_mov_b32_e32 v3, s41 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24 +; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16 +; SI-NEXT: v_alignbit_b32 v21, s43, v21, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 4 ; SI-NEXT: s_lshr_b32 s4, s9, 16 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v5 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v31 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v7, v26, v7 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16 -; SI-NEXT: v_alignbit_b32 v48, s43, v21, 8 ; SI-NEXT: v_alignbit_b32 v21, v3, v16, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 5 ; SI-NEXT: s_lshr_b32 s4, s9, 8 @@ -209246,7 +209476,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v3, v3, v16, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 6 ; SI-NEXT: s_lshr_b32 s4, s8, 24 -; SI-NEXT: v_or_b32_e32 v5, v30, v5 +; SI-NEXT: v_or_b32_e32 v5, v38, v5 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -209255,7 +209485,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s4, s8, 16 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v11, v29, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v7, v14, 16 @@ -209270,20 +209500,20 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v3, v7, v14, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 9 ; SI-NEXT: s_lshr_b32 s4, s7, 24 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 +; SI-NEXT: v_or_b32_e32 v5, v28, v5 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v11, v12, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 10 ; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v35 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 +; SI-NEXT: v_or_b32_e32 v10, v30, v10 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v11, v12, 16 @@ -209301,23 +209531,23 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v3, v11, v12, 8 ; SI-NEXT: v_writelane_b32 v62, s4, 12 ; SI-NEXT: s_lshr_b32 s4, s6, 24 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 +; SI-NEXT: v_or_b32_e32 v2, v49, v2 +; SI-NEXT: v_or_b32_e32 v4, v48, v4 +; SI-NEXT: v_or_b32_e32 v5, v34, v5 +; SI-NEXT: v_or_b32_e32 v8, v32, v8 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v3, v15, v10, 24 ; SI-NEXT: v_writelane_b32 v62, s4, 13 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v35, s6 +; SI-NEXT: v_mov_b32_e32 v37, s6 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; SI-NEXT: v_mov_b32_e32 v34, s7 +; SI-NEXT: v_mov_b32_e32 v36, s7 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; SI-NEXT: v_mov_b32_e32 v33, s8 -; SI-NEXT: v_mov_b32_e32 v32, s9 +; SI-NEXT: v_mov_b32_e32 v35, s8 +; SI-NEXT: v_mov_b32_e32 v33, s9 ; SI-NEXT: v_mov_b32_e32 v20, s10 ; SI-NEXT: v_mov_b32_e32 v17, s11 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 @@ -209346,18 +209576,18 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_alignbit_b32 v17, v20, v9, 24 ; SI-NEXT: v_alignbit_b32 v18, v20, v9, 16 ; SI-NEXT: v_alignbit_b32 v20, v20, v9, 8 -; SI-NEXT: v_alignbit_b32 v59, v32, v6, 24 -; SI-NEXT: v_alignbit_b32 v60, v32, v6, 16 -; SI-NEXT: v_alignbit_b32 v61, v32, v6, 8 -; SI-NEXT: v_alignbit_b32 v46, v33, v4, 24 -; SI-NEXT: v_alignbit_b32 v56, v33, v4, 16 -; SI-NEXT: v_alignbit_b32 v58, v33, v4, 8 -; SI-NEXT: v_alignbit_b32 v55, v34, v2, 24 -; SI-NEXT: v_alignbit_b32 v40, v34, v2, 16 -; SI-NEXT: v_alignbit_b32 v42, v34, v2, 8 -; SI-NEXT: v_alignbit_b32 v52, v35, v1, 24 -; SI-NEXT: v_alignbit_b32 v53, v35, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, v35, v1, 8 +; SI-NEXT: v_alignbit_b32 v59, v33, v6, 24 +; SI-NEXT: v_alignbit_b32 v60, v33, v6, 16 +; SI-NEXT: v_alignbit_b32 v61, v33, v6, 8 +; SI-NEXT: v_alignbit_b32 v46, v35, v4, 24 +; SI-NEXT: v_alignbit_b32 v56, v35, v4, 16 +; SI-NEXT: v_alignbit_b32 v58, v35, v4, 8 +; SI-NEXT: v_alignbit_b32 v55, v36, v2, 24 +; SI-NEXT: v_alignbit_b32 v40, v36, v2, 16 +; SI-NEXT: v_alignbit_b32 v42, v36, v2, 8 +; SI-NEXT: v_alignbit_b32 v52, v37, v1, 24 +; SI-NEXT: v_alignbit_b32 v53, v37, v1, 16 +; SI-NEXT: v_alignbit_b32 v54, v37, v1, 8 ; SI-NEXT: s_lshr_b32 s83, s57, 24 ; SI-NEXT: s_lshr_b32 s71, s57, 16 ; SI-NEXT: s_lshr_b32 s68, s57, 8 @@ -209394,10 +209624,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v62, s4, 15 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s56, 0xff ; SI-NEXT: s_lshl_b32 s5, s68, 8 ; SI-NEXT: s_lshl_b32 s16, s83, 24 @@ -209415,6 +209644,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s82, v63, 26 ; SI-NEXT: v_readlane_b32 s81, v63, 25 ; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_or_b32_e32 v3, v7, v3 @@ -209473,65 +209705,64 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s44, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: s_lshl_b32 s16, s67, 24 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s51, v63, 11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s66, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 +; SI-NEXT: s_lshl_b32 s5, s51, 8 +; SI-NEXT: s_lshl_b32 s16, s67, 24 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s55, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: s_lshl_b32 s16, s64, 24 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s48, v63, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s55, 0xff ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v7, v11, v7 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: v_or_b32_e32 v3, v3, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -209541,16 +209772,21 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 ; SI-NEXT: s_and_b32 s4, s41, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s52, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s16, s64, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s16, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_lshl_b32 s5, s37, 8 ; SI-NEXT: s_lshl_b32 s16, s53, 24 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 ; SI-NEXT: v_readlane_b32 s53, v63, 13 ; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s48, v63, 8 ; SI-NEXT: v_readlane_b32 s37, v63, 5 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -209910,51 +210146,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $vcc_lo -; SI-NEXT: v_mov_b32_e32 v39, v24 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v30, v18 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v29, v17 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v28, v13 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v26, v9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 @@ -209962,7 +210153,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; kill: killed $vcc_lo ; SI-NEXT: ; implicit-def: $vcc_lo ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr71 @@ -209974,13 +210164,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr69 ; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr70 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr55 @@ -210030,12 +210218,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; kill: killed $vcc_lo -; SI-NEXT: ; implicit-def: $vcc_lo -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; kill: killed $vcc_lo ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 @@ -210062,6 +210244,54 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: ; kill: killed $vcc_lo +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -217279,1071 +217509,408 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v14 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v15 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v16 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v17 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v28 ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s29 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB101_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: v_mov_b32_e32 v55, v50 -; SI-NEXT: v_mov_b32_e32 v40, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v24, v47 -; SI-NEXT: v_mov_b32_e32 v23, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v26, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v36, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_mov_b32_e32 v35, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v38, v10 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v19, v28 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v39, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_mov_b32_e32 v47, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_mov_b32_e32 v15, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_mov_b32_e32 v51, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_mov_b32_e32 v37, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v57 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: s_cbranch_execnz .LBB101_3 -; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v26 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v62 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v48 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v53 -; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v34 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s29 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB101_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v49 -; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_mov_b32_e32 v32, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; SI-NEXT: v_mov_b32_e32 v54, v40 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_mov_b32_e32 v40, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 -; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v35 -; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 -; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 -; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 -; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v59 -; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v60 -; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_add_i32_e32 v16, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 -; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v44 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_mov_b32_e32 v31, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v40, v52 -; SI-NEXT: v_mov_b32_e32 v55, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_mov_b32_e32 v46, v54 +; SI-NEXT: v_mov_b32_e32 v16, v20 +; SI-NEXT: v_mov_b32_e32 v20, v32 +; SI-NEXT: v_mov_b32_e32 v32, v53 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v56, v37 +; SI-NEXT: v_mov_b32_e32 v47, v11 +; SI-NEXT: s_branch .LBB101_3 +; SI-NEXT: .LBB101_2: +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -218369,72 +217936,775 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v26, v57 -; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: v_mov_b32_e32 v24, v47 -; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_mov_b32_e32 v20, v53 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v46, v40 +; SI-NEXT: v_mov_b32_e32 v45, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v37 +; SI-NEXT: v_mov_b32_e32 v47, v11 +; SI-NEXT: v_mov_b32_e32 v31, v15 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: .LBB101_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v5, v59 +; SI-NEXT: v_mov_b32_e32 v10, v63 +; SI-NEXT: v_mov_b32_e32 v59, v28 +; SI-NEXT: v_mov_b32_e32 v63, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v7, v60 +; SI-NEXT: v_mov_b32_e32 v11, v57 +; SI-NEXT: v_mov_b32_e32 v12, v38 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_mov_b32_e32 v37, v55 +; SI-NEXT: v_mov_b32_e32 v15, v36 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v8 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v57, v13 +; SI-NEXT: v_mov_b32_e32 v60, v41 +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: s_cbranch_vccnz .LBB101_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v44 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v31 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v61 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; VI: ; %bb.0: @@ -220488,6 +220758,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 @@ -220511,26 +220782,149 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -220546,23 +220940,8 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 @@ -220576,21 +220955,34 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -220615,333 +221007,222 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v49 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v43 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -220991,110 +221272,99 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63 ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v61 ; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v57 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v56 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v56 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v42 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v44 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v45 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v40 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v41 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v52 ; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v53 ; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v36 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -221104,8 +221374,13 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -221127,624 +221402,626 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; SI-NEXT: v_mov_b32_e32 v3, v24 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -222018,13 +222295,14 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 @@ -222042,66 +222320,64 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v46, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v46, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s24 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v39 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v48 -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v48 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -222111,17 +222387,19 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v50 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill @@ -222129,737 +222407,717 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mov_b32_e32 v50, v19 -; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_mov_b32_e32 v37, v45 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mov_b32_e32 v51, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v44 +; SI-NEXT: v_mov_b32_e32 v50, v26 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_mov_b32_e32 v57, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_mov_b32_e32 v33, v12 -; SI-NEXT: v_mov_b32_e32 v34, v5 -; SI-NEXT: v_mov_b32_e32 v58, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v44 -; SI-NEXT: v_mov_b32_e32 v44, v18 -; SI-NEXT: v_mov_b32_e32 v5, v43 -; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v58 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_mov_b32_e32 v59, v11 +; SI-NEXT: v_mov_b32_e32 v60, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: v_mov_b32_e32 v7, v6 ; SI-NEXT: s_branch .LBB103_3 ; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v37, v45 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v50, v26 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v51, v21 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: v_mov_b32_e32 v5, v6 -; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v9 -; SI-NEXT: v_mov_b32_e32 v12, v31 +; SI-NEXT: v_mov_b32_e32 v19, v20 +; SI-NEXT: v_mov_b32_e32 v6, v27 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v61, v2 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v31, v11 -; SI-NEXT: v_mov_b32_e32 v9, v17 ; SI-NEXT: s_cbranch_vccnz .LBB103_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v18 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v57 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v50, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v43, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v59, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v17, v11 +; SI-NEXT: v_mov_b32_e32 v16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v13 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v22, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 ; SI-NEXT: .LBB103_5: ; %end ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload @@ -222869,7 +223127,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload @@ -222880,7 +223138,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload @@ -222891,7 +223149,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload @@ -222902,7 +223160,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -222913,10 +223171,10 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -222924,10 +223182,10 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -222935,47 +223193,47 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -222986,7 +223244,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload @@ -222997,148 +223255,145 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -226954,6 +227209,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-LABEL: bitcast_v64bf16_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v22 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -226970,311 +227226,306 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v50 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v28 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v62 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v27 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s26 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v32 ; SI-NEXT: v_mul_f32_e32 v27, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v53 -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v54 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v40 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v41 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v43 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v57 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v56, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v45, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v44, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s28 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v2 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v46 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v37 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_mov_b32_e32 v42, v62 -; SI-NEXT: v_mov_b32_e32 v43, v63 -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_mov_b32_e32 v25, v60 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v51, v61 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v52, v10 -; SI-NEXT: v_mov_b32_e32 v53, v59 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v52 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v38 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_mov_b32_e32 v14, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v13 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_mov_b32_e32 v62, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_mov_b32_e32 v16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v27 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_mov_b32_e32 v39, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v62 +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v36 +; SI-NEXT: v_mov_b32_e32 v45, v26 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v48 +; SI-NEXT: v_mov_b32_e32 v44, v24 +; SI-NEXT: v_mov_b32_e32 v43, v23 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v7, v6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v41 +; SI-NEXT: v_mov_b32_e32 v56, v51 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v29, v2 +; SI-NEXT: v_mov_b32_e32 v2, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v26 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v48 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v49 +; SI-NEXT: v_mov_b32_e32 v23, v17 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v34 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v38, v8 ; SI-NEXT: s_branch .LBB105_3 ; SI-NEXT: .LBB105_2: -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, v37 +; SI-NEXT: v_mov_b32_e32 v37, v34 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -227298,529 +227549,537 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_mov_b32_e32 v7, v6 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_mov_b32_e32 v51, v61 -; SI-NEXT: v_mov_b32_e32 v42, v62 -; SI-NEXT: v_mov_b32_e32 v29, v31 -; SI-NEXT: v_mov_b32_e32 v25, v60 -; SI-NEXT: v_mov_b32_e32 v24, v56 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v43, v63 -; SI-NEXT: v_mov_b32_e32 v52, v10 -; SI-NEXT: v_mov_b32_e32 v53, v59 -; SI-NEXT: v_mov_b32_e32 v39, v4 -; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_mov_b32_e32 v52, v38 +; SI-NEXT: v_mov_b32_e32 v22, v11 +; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_mov_b32_e32 v16, v15 +; SI-NEXT: v_mov_b32_e32 v14, v21 +; SI-NEXT: v_mov_b32_e32 v28, v13 +; SI-NEXT: v_mov_b32_e32 v10, v9 +; SI-NEXT: v_mov_b32_e32 v57, v8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v46, v27 +; SI-NEXT: v_mov_b32_e32 v45, v26 +; SI-NEXT: v_mov_b32_e32 v44, v24 +; SI-NEXT: v_mov_b32_e32 v43, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v51 ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; kill: killed $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; kill: killed $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v7 +; SI-NEXT: v_mov_b32_e32 v24, v10 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB105_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: v_alignbit_b32 v1, v4, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_alignbit_b32 v1, v5, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_alignbit_b32 v1, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: v_alignbit_b32 v1, v7, v1, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: v_alignbit_b32 v51, v16, v1, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; SI-NEXT: v_alignbit_b32 v1, v8, v1, 16 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v22 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v18, v20, v1, 16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v32 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: v_alignbit_b32 v1, v10, v1, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16 +; SI-NEXT: v_alignbit_b32 v38, v11, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v52, v12, v1, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12 +; SI-NEXT: v_alignbit_b32 v1, v13, v1, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v46 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v27, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 +; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v45 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v28, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_alignbit_b32 v50, v15, v1, 16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_alignbit_b32 v1, v16, v1, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v51 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v52, v30, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: v_alignbit_b32 v39, v19, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v41, v42, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v40, v41, v17, 16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v17, v39 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 ; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v28, v59, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16 +; SI-NEXT: v_alignbit_b32 v53, v54, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v46, v61, v31, 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v21, v30, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v26, v36, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v23, v10, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v63, v23, v27, 16 -; SI-NEXT: v_alignbit_b32 v27, v21, v12, 16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_alignbit_b32 v27, v35, v1, 16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v57, v58, v1, 16 +; SI-NEXT: v_alignbit_b32 v62, v23, v1, 16 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v17, v1, v20, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 -; SI-NEXT: v_alignbit_b32 v56, v47, v20, 16 -; SI-NEXT: v_alignbit_b32 v20, v62, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v16, v56, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v22, v45, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29 -; SI-NEXT: v_alignbit_b32 v13, v60, v25, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_alignbit_b32 v1, v30, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v47, v21, v12, 16 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_alignbit_b32 v24, v44, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v12 +; SI-NEXT: v_alignbit_b32 v31, v22, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v12, v11, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v63, v61, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; SI-NEXT: v_alignbit_b32 v60, v59, v7, 16 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 +; SI-NEXT: v_alignbit_b32 v58, v29, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; SI-NEXT: v_alignbit_b32 v20, v19, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v18, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v15 -; SI-NEXT: v_mov_b32_e32 v15, v24 -; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v3, v39, 16 -; SI-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16 -; SI-NEXT: v_alignbit_b32 v5, v36, v7, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v2, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, v46, v33, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v18, v5, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v24, v38, 16 -; SI-NEXT: v_alignbit_b32 v38, v48, v8, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v4, v22, v37, 16 +; SI-NEXT: v_alignbit_b32 v2, v63, v33, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v57, v32, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v4, v20, v34, 16 +; SI-NEXT: v_alignbit_b32 v2, v12, v8, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v1, v9, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v13, v14, 16 -; SI-NEXT: v_mov_b32_e32 v14, v51 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: v_alignbit_b32 v2, v31, v10, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v2, v47, v32, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v2, v62, v25, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v2, v27, v13, 16 +; SI-NEXT: v_alignbit_b32 v5, v20, v4, 16 +; SI-NEXT: v_alignbit_b32 v4, v58, v37, 16 +; SI-NEXT: v_alignbit_b32 v3, v60, v34, 16 +; SI-NEXT: v_mov_b32_e32 v33, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v2, v26, v14, 16 +; SI-NEXT: v_alignbit_b32 v38, v53, v15, 16 +; SI-NEXT: v_mov_b32_e32 v15, v22 +; SI-NEXT: v_alignbit_b32 v25, v48, v16, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -227829,125 +228088,134 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -227960,14 +228228,14 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -233303,33 +233571,33 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 @@ -233348,33 +233616,32 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v12, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v9, v60 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v51 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 ; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v37, v40 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v57, s18 @@ -233388,14 +233655,21 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v45, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v61, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: s_cbranch_scc0 .LBB109_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB109_3 -; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB109_3 +; SI-NEXT: .LBB109_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB109_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v41, v2 +; SI-NEXT: v_mov_b32_e32 v2, v48 +; SI-NEXT: s_cbranch_vccnz .LBB109_5 +; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 @@ -233433,7 +233707,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_or_b32_e32 v44, v1, v48 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -233483,54 +233758,53 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v38, v38, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v52 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_or_b32_e32 v49, v48, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v52, v48, v51 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v30 ; SI-NEXT: v_or_b32_e32 v29, v29, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v59 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v47 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_or_b32_e32 v46, v48, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v57 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v61 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 ; SI-NEXT: v_or_b32_e32 v57, v48, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v63 @@ -233540,9 +233814,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_or_b32_e32 v60, v48, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 @@ -233550,35 +233823,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_or_b32_e32 v56, v54, v48 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v41 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v45, v40, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 ; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_or_b32_e32 v7, v41, v55 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -233589,13 +233854,14 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 @@ -233629,10 +233895,18 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v63, v46, v51, 16 ; SI-NEXT: v_alignbit_b32 v62, v29, v48, 16 ; SI-NEXT: v_alignbit_b32 v61, v52, v54, 16 -; SI-NEXT: v_alignbit_b32 v44, v49, v55, 16 ; SI-NEXT: v_alignbit_b32 v13, v32, v13, 16 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 ; SI-NEXT: v_alignbit_b32 v21, v2, v21, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_or_b32_e32 v7, v41, v55 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -233642,63 +233916,59 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_or_b32_e32 v7, v7, v40 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v7, v41, v10 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_or_b32_e32 v7, v41, v20 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v20, v31, v20, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_or_b32_e32 v7, v41, v28 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 ; SI-NEXT: v_or_b32_e32 v7, v41, v27 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 +; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v7, v41, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_mov_b32_e32 v1, v44 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v44, v7 ; SI-NEXT: v_or_b32_e32 v43, v42, v24 ; SI-NEXT: v_alignbit_b32 v26, v8, v26, 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v7, v41, v37 ; SI-NEXT: v_mov_b32_e32 v51, v7 +; SI-NEXT: v_alignbit_b32 v41, v49, v55, 16 ; SI-NEXT: v_alignbit_b32 v7, v38, v40, 16 ; SI-NEXT: v_alignbit_b32 v24, v5, v24, 16 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v37, v1, v37, 16 -; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: .LBB109_5: ; %end ; SI-NEXT: v_and_b32_e32 v48, 0xffff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v48, v4 @@ -233746,8 +234016,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v29, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v41 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -233766,7 +234036,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v29, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v7 @@ -233779,7 +234049,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -233805,7 +234075,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -233831,7 +234101,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -233845,7 +234115,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -233859,10 +234129,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v4, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen @@ -233920,8 +234188,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB109_4: -; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v64f16_to_v64i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 352b2cb7123b1..ee43eff01ab60 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -13390,42 +13390,46 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v44, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 @@ -13437,49 +13441,48 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 @@ -13487,113 +13490,80 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v0, v0, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_mov_b32_e32 v44, v10 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v12 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -13602,6 +13572,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -13610,65 +13581,116 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -13710,100 +13732,58 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -13826,47 +13806,12 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v44, v10 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v52, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB27_2 ; @@ -13889,142 +13834,121 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v37, v30 -; VI-NEXT: v_mov_b32_e32 v61, v28 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v28 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v48, v12 +; VI-NEXT: v_mov_b32_e32 v49, v10 +; VI-NEXT: v_mov_b32_e32 v50, v8 +; VI-NEXT: v_mov_b32_e32 v51, v6 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v45, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v26, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -14054,47 +13978,91 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -14134,76 +14102,35 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14224,43 +14151,9 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v39, v10 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v41, v5 -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB27_2 ; @@ -14283,147 +14176,124 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v37, v30 -; GFX9-NEXT: v_mov_b32_e32 v61, v28 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: v_mov_b32_e32 v36, v28 +; GFX9-NEXT: v_mov_b32_e32 v35, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v50, v8 +; GFX9-NEXT: v_mov_b32_e32 v51, v6 +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v45, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v26, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -14453,48 +14323,78 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB27_3 ; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s5, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 -; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 ; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_and_b32 s5, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s17, 8 @@ -14518,6 +14418,20 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s24, s24, 3 @@ -14534,76 +14448,35 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB27_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14624,43 +14497,9 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB27_4: -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v43, v12 -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v5 -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB27_2 ; @@ -28259,42 +28098,46 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v44, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 @@ -28306,49 +28149,48 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 @@ -28356,113 +28198,80 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v0, v0, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_mov_b32_e32 v44, v10 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v12 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -28471,6 +28280,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -28479,65 +28289,116 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -28579,100 +28440,58 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -28695,47 +28514,12 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v44, v10 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v52, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB51_2 ; @@ -28758,142 +28542,121 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v37, v30 -; VI-NEXT: v_mov_b32_e32 v61, v28 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v28 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v48, v12 +; VI-NEXT: v_mov_b32_e32 v49, v10 +; VI-NEXT: v_mov_b32_e32 v50, v8 +; VI-NEXT: v_mov_b32_e32 v51, v6 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v45, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v26, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -28923,47 +28686,91 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -29003,76 +28810,35 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -29093,43 +28859,9 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v39, v10 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v41, v5 -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB51_2 ; @@ -29152,147 +28884,124 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v37, v30 -; GFX9-NEXT: v_mov_b32_e32 v61, v28 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: v_mov_b32_e32 v36, v28 +; GFX9-NEXT: v_mov_b32_e32 v35, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v50, v8 +; GFX9-NEXT: v_mov_b32_e32 v51, v6 +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v45, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v26, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -29322,48 +29031,78 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s5, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 -; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 ; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_and_b32 s5, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s17, 8 @@ -29387,6 +29126,20 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s24, s24, 3 @@ -29403,76 +29156,35 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB51_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -29493,43 +29205,9 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB51_4: -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v43, v12 -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v5 -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB51_2 ; @@ -42400,42 +42078,46 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v44, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 @@ -42447,49 +42129,48 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 @@ -42497,113 +42178,80 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v0, v0, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_mov_b32_e32 v44, v10 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v12 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -42612,6 +42260,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -42620,65 +42269,116 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -42720,100 +42420,58 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -42836,47 +42494,12 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v44, v10 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v52, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB71_2 ; @@ -42899,142 +42522,121 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v37, v30 -; VI-NEXT: v_mov_b32_e32 v61, v28 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v28 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v48, v12 +; VI-NEXT: v_mov_b32_e32 v49, v10 +; VI-NEXT: v_mov_b32_e32 v50, v8 +; VI-NEXT: v_mov_b32_e32 v51, v6 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v45, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v26, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -43064,47 +42666,91 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -43144,76 +42790,35 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB71_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -43234,43 +42839,9 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v39, v10 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v41, v5 -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB71_2 ; @@ -43293,147 +42864,124 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v37, v30 -; GFX9-NEXT: v_mov_b32_e32 v61, v28 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: v_mov_b32_e32 v36, v28 +; GFX9-NEXT: v_mov_b32_e32 v35, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v50, v8 +; GFX9-NEXT: v_mov_b32_e32 v51, v6 +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v45, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v26, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -43463,48 +43011,78 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB71_3 ; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s5, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 -; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 ; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_and_b32 s5, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s17, 8 @@ -43528,6 +43106,20 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s24, s24, 3 @@ -43544,76 +43136,35 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB71_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -43634,43 +43185,9 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB71_4: -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v43, v12 -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v5 -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB71_2 ; @@ -55687,42 +55204,46 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v44, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 @@ -55734,49 +55255,48 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB87_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 @@ -55784,113 +55304,80 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v0, v0, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_mov_b32_e32 v44, v10 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v12 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -55899,6 +55386,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -55907,65 +55395,116 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_cbranch_execnz .LBB87_3 ; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -56007,100 +55546,58 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -56123,47 +55620,12 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB87_4: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v44, v10 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v52, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB87_2 ; @@ -56186,142 +55648,121 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v37, v30 -; VI-NEXT: v_mov_b32_e32 v61, v28 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v28 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v48, v12 +; VI-NEXT: v_mov_b32_e32 v49, v10 +; VI-NEXT: v_mov_b32_e32 v50, v8 +; VI-NEXT: v_mov_b32_e32 v51, v6 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v45, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v26, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -56351,47 +55792,91 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB87_3 ; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -56431,76 +55916,35 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB87_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -56521,43 +55965,9 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v39, v10 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v41, v5 -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB87_2 ; @@ -56580,147 +55990,124 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v37, v30 -; GFX9-NEXT: v_mov_b32_e32 v61, v28 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: v_mov_b32_e32 v36, v28 +; GFX9-NEXT: v_mov_b32_e32 v35, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v50, v8 +; GFX9-NEXT: v_mov_b32_e32 v51, v6 +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v45, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v26, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -56750,48 +56137,78 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB87_3 ; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s5, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 -; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 ; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_and_b32 s5, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s17, 8 @@ -56815,6 +56232,20 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s24, s24, 3 @@ -56831,76 +56262,35 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB87_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -56921,43 +56311,9 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB87_4: -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v43, v12 -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v5 -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB87_2 ; @@ -69721,139 +69077,126 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v20 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: v_mov_b32_e32 v31, v30 +; VI-NEXT: v_mov_b32_e32 v38, v28 +; VI-NEXT: v_mov_b32_e32 v32, v26 +; VI-NEXT: v_mov_b32_e32 v30, v24 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v20, v10 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 -; VI-NEXT: v_mov_b32_e32 v51, v23 -; VI-NEXT: v_mov_b32_e32 v30, v26 -; VI-NEXT: v_mov_b32_e32 v26, v22 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -69862,6 +69205,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -69870,70 +69214,75 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 -; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 -; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 -; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v26, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v9, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v8, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 -; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v6, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 -; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v5, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 @@ -69961,12 +69310,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s10, s16, 0xff ; VI-NEXT: s_lshl_b32 s11, s17, 8 ; VI-NEXT: s_or_b32 s10, s11, s10 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -69976,7 +69324,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -69988,10 +69336,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -70001,12 +69349,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 @@ -70014,11 +69361,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v3, s4, v3 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -70044,22 +69392,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB99_4: -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v31, v10 -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_mov_b32_e32 v37, v8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB99_2 ; @@ -70082,244 +69416,228 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v34, v30 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v48, v30 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v32, v22 +; GFX9-NEXT: v_mov_b32_e32 v30, v20 +; GFX9-NEXT: v_mov_b32_e32 v49, v14 +; GFX9-NEXT: v_mov_b32_e32 v22, v12 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-NEXT: v_mov_b32_e32 v26, v2 +; GFX9-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v50, v3 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v18, v22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB99_3 ; GFX9-NEXT: .LBB99_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 -; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -70344,13 +69662,18 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -70359,69 +69682,48 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 ; GFX9-NEXT: s_addk_i32 s10, 0x300 -; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 ; GFX9-NEXT: .LBB99_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -70443,27 +69745,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB99_4: -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v10 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB99_2 ; @@ -81898,139 +81180,126 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v20 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: v_mov_b32_e32 v31, v30 +; VI-NEXT: v_mov_b32_e32 v38, v28 +; VI-NEXT: v_mov_b32_e32 v32, v26 +; VI-NEXT: v_mov_b32_e32 v30, v24 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v20, v10 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 -; VI-NEXT: v_mov_b32_e32 v51, v23 -; VI-NEXT: v_mov_b32_e32 v30, v26 -; VI-NEXT: v_mov_b32_e32 v26, v22 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -82039,6 +81308,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -82047,70 +81317,75 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 -; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 -; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 -; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v26, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v9, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v8, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 -; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v6, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 -; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v5, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 @@ -82138,12 +81413,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s10, s16, 0xff ; VI-NEXT: s_lshl_b32 s11, s17, 8 ; VI-NEXT: s_or_b32 s10, s11, s10 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -82153,7 +81427,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -82165,10 +81439,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -82178,12 +81452,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 @@ -82191,11 +81464,12 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v3, s4, v3 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -82221,22 +81495,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB107_4: -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v31, v10 -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_mov_b32_e32 v37, v8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB107_2 ; @@ -82259,244 +81519,228 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v34, v30 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v48, v30 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v32, v22 +; GFX9-NEXT: v_mov_b32_e32 v30, v20 +; GFX9-NEXT: v_mov_b32_e32 v49, v14 +; GFX9-NEXT: v_mov_b32_e32 v22, v12 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-NEXT: v_mov_b32_e32 v26, v2 +; GFX9-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v50, v3 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v18, v22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB107_3 ; GFX9-NEXT: .LBB107_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 -; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -82521,13 +81765,18 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -82536,69 +81785,48 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 ; GFX9-NEXT: s_addk_i32 s10, 0x300 -; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 ; GFX9-NEXT: .LBB107_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -82620,27 +81848,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB107_4: -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v10 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB107_2 ; @@ -86513,60 +85721,47 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v46, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v61, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: v_mul_f32_e64 v36, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v31, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB109_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v46 ; SI-NEXT: v_alignbit_b32 v27, v1, v3, 16 ; SI-NEXT: v_alignbit_b32 v30, v24, v2, 16 ; SI-NEXT: v_alignbit_b32 v1, v30, v27, 24 @@ -86579,8 +85774,8 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_alignbit_b32 v21, v1, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v61 +; SI-NEXT: v_alignbit_b32 v21, v1, v28, 16 ; SI-NEXT: v_alignbit_b32 v19, v17, v4, 16 ; SI-NEXT: v_alignbit_b32 v1, v19, v21, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -86591,10 +85786,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v1, v19, v21, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: v_alignbit_b32 v15, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v16, v13, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_alignbit_b32 v15, v1, v39, 16 +; SI-NEXT: v_alignbit_b32 v16, v13, v35, 16 ; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -86604,10 +85799,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_alignbit_b32 v10, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v11, v9, v20, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v62 +; SI-NEXT: v_alignbit_b32 v10, v1, v55, 16 +; SI-NEXT: v_alignbit_b32 v11, v9, v23, 16 ; SI-NEXT: v_alignbit_b32 v1, v11, v10, 24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -86617,283 +85812,392 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_alignbit_b32 v1, v11, v10, 8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_alignbit_b32 v6, v1, v28, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 -; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16 -; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_alignbit_b32 v6, v1, v48, 16 +; SI-NEXT: v_alignbit_b32 v20, v18, v37, 16 +; SI-NEXT: v_alignbit_b32 v4, v20, v6, 24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v20, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v51 +; SI-NEXT: v_alignbit_b32 v3, v1, v40, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v20, v6, 8 +; SI-NEXT: v_alignbit_b32 v14, v12, v53, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v14, v3, 24 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v14, v3, 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 +; SI-NEXT: v_alignbit_b32 v2, v1, v45, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v14, v3, 8 +; SI-NEXT: v_alignbit_b32 v8, v7, v43, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 -; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16 -; SI-NEXT: v_mov_b32_e32 v31, v23 -; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16 -; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16 -; SI-NEXT: v_alignbit_b32 v23, v5, v1, 24 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_alignbit_b32 v36, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16 -; SI-NEXT: v_mov_b32_e32 v53, v32 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16 -; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 -; SI-NEXT: v_mov_b32_e32 v35, v29 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_mov_b32_e32 v37, v33 -; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v41 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_mov_b32_e32 v28, v26 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; SI-NEXT: v_mov_b32_e32 v26, v42 +; SI-NEXT: v_alignbit_b32 v4, v8, v2, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v4, v8, v2, 8 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v47 +; SI-NEXT: v_alignbit_b32 v1, v1, v58, 16 +; SI-NEXT: v_alignbit_b32 v5, v4, v56, 16 +; SI-NEXT: v_alignbit_b32 v25, v5, v1, 24 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v14 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v5, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v11 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v5, v1, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v34 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v47 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v46 +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v61 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v62 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v51 ; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; SI-NEXT: v_mov_b32_e32 v34, v44 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; SI-NEXT: v_mov_b32_e32 v33, v56 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48 -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v50, v25 -; SI-NEXT: v_mov_b32_e32 v25, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 -; SI-NEXT: s_cbranch_execnz .LBB109_3 -; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v5 +; SI-NEXT: s_branch .LBB109_3 +; SI-NEXT: .LBB109_2: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; kill: killed $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: .LBB109_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: v_mov_b32_e32 v63, v22 +; SI-NEXT: s_cbranch_vccnz .LBB109_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v47 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v44 ; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45 -; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v42 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v42 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54 ; SI-NEXT: v_alignbit_b32 v8, v7, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v32, v8, v2, 16 -; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v51, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v51 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38 +; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v37 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52 +; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v35 +; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v13 +; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v62 +; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 +; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v23 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v27, v25, v24, 16 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v51 +; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v42 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v5 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 -; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 -; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v28 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v61 +; SI-NEXT: v_alignbit_b32 v30, v24, v25, 16 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v25, v30, v27, 24 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v30, v27, 16 +; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v30, v27, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v19, v21, 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v19, v21, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v19, v21, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v16, v15, 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v16, v15, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v16, v15, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v11, v10, 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v11, v10, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v25, v11, v10, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v59 -; SI-NEXT: v_alignbit_b32 v30, v24, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 24 -; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v20, v6, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v20, v6, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v14, v3, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v14, v3, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v8, v2, 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v8, v2, 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 +; SI-NEXT: v_alignbit_b32 v25, v8, v2, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v5, v1, 24 +; SI-NEXT: v_lshrrev_b32_e32 v49, 24, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v34 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v5, v1, 16 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v29 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v8, v2, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v5, v1, 8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v5, v1, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v14 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v5, v1, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v20 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 -; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: .LBB109_5: ; %end ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v27 ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 @@ -86908,27 +86212,32 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v23 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v27, v36 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xff, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v23 -; SI-NEXT: v_or_b32_e32 v33, v33, v35 -; SI-NEXT: v_or_b32_e32 v27, v27, v33 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v62 -; SI-NEXT: v_or_b32_e32 v27, v27, v30 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v22, v27, v22 -; SI-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_or_b32_e32 v23, v27, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v50 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v41 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -86938,18 +86247,18 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v24, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v46 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v32 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v49 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v17, v21, v17 ; SI-NEXT: v_or_b32_e32 v17, v19, v17 @@ -86974,9 +86283,9 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v57 +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v25 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v61 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v33 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: v_or_b32_e32 v13, v16, v13 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 @@ -87001,111 +86310,139 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v58 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v63 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v26 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: v_or_b32_e32 v9, v11, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v40 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v29 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v59 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v6, v6, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v50 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v60 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v63 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v59 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v6, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -87125,99 +86462,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: v_mov_b32_e32 v53, v32 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v37, v33 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v33, v56 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v29 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v34, v44 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v31, v23 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v28, v26 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v23, v41 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: ; VI: ; %bb.0: @@ -92395,139 +91639,126 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v20 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: v_mov_b32_e32 v31, v30 +; VI-NEXT: v_mov_b32_e32 v38, v28 +; VI-NEXT: v_mov_b32_e32 v32, v26 +; VI-NEXT: v_mov_b32_e32 v30, v24 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v20, v10 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 -; VI-NEXT: v_mov_b32_e32 v51, v23 -; VI-NEXT: v_mov_b32_e32 v30, v26 -; VI-NEXT: v_mov_b32_e32 v26, v22 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -92536,6 +91767,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -92544,70 +91776,75 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 -; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 -; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 -; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v26, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v9, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v8, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 -; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v6, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 -; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v5, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 @@ -92635,12 +91872,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_and_b32 s10, s16, 0xff ; VI-NEXT: s_lshl_b32 s11, s17, 8 ; VI-NEXT: s_or_b32 s10, s11, s10 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -92650,7 +91886,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -92662,10 +91898,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -92675,12 +91911,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 @@ -92688,11 +91923,12 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v3, s4, v3 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -92718,22 +91954,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB111_4: -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v31, v10 -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_mov_b32_e32 v37, v8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB111_2 ; @@ -92756,244 +91978,228 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v34, v30 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v48, v30 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v32, v22 +; GFX9-NEXT: v_mov_b32_e32 v30, v20 +; GFX9-NEXT: v_mov_b32_e32 v49, v14 +; GFX9-NEXT: v_mov_b32_e32 v22, v12 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-NEXT: v_mov_b32_e32 v26, v2 +; GFX9-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v50, v3 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v18, v22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB111_3 ; GFX9-NEXT: .LBB111_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 -; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -93018,13 +92224,18 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -93033,69 +92244,48 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 ; GFX9-NEXT: s_addk_i32 s10, 0x300 -; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 ; GFX9-NEXT: .LBB111_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -93117,27 +92307,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB111_4: -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v10 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB111_2 ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 152a48bec2636..d7faaa68732d2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -5100,88 +5100,88 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v56, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v7, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v9, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v10, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_or_b32_e32 v11, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_or_b32_e32 v12, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_or_b32_e32 v13, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_or_b32_e32 v20, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -5191,13 +5191,13 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v22, v0, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v58 -; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v57 +; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -5207,60 +5207,60 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -5291,17 +5291,17 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -5309,7 +5309,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -5339,66 +5339,64 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v39, v37 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v38 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v39 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: v_mov_b32_e32 v52, v48 +; SI-NEXT: v_mov_b32_e32 v48, v35 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v57 ; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v40, v38 +; SI-NEXT: v_mov_b32_e32 v38, v31 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v44, v27 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v34 +; SI-NEXT: v_mov_b32_e32 v34, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v31, v38 +; SI-NEXT: v_mov_b32_e32 v38, v40 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v48i16_to_v24i32_scalar: @@ -14442,88 +14440,88 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v56, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v7, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v9, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v10, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_or_b32_e32 v11, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_or_b32_e32 v12, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_or_b32_e32 v13, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_or_b32_e32 v20, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -14533,13 +14531,13 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v22, v0, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v58 -; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v57 +; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -14549,60 +14547,60 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -14633,17 +14631,17 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -14651,7 +14649,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -14681,66 +14679,64 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v39, v37 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v38 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v39 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: v_mov_b32_e32 v52, v48 +; SI-NEXT: v_mov_b32_e32 v48, v35 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v57 ; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v40, v38 +; SI-NEXT: v_mov_b32_e32 v38, v31 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v44, v27 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v34 +; SI-NEXT: v_mov_b32_e32 v34, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v31, v38 +; SI-NEXT: v_mov_b32_e32 v38, v40 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v48i16_to_v24f32_scalar: @@ -23043,88 +23039,88 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v56, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v7, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v9, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v10, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_or_b32_e32 v11, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_or_b32_e32 v12, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_or_b32_e32 v13, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_or_b32_e32 v20, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -23134,13 +23130,13 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v22, v0, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v58 -; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v57 +; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -23150,60 +23146,60 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -23234,17 +23230,17 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -23252,7 +23248,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -23282,66 +23278,64 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v39, v37 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v38 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v39 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: v_mov_b32_e32 v52, v48 +; SI-NEXT: v_mov_b32_e32 v48, v35 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v57 ; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v40, v38 +; SI-NEXT: v_mov_b32_e32 v38, v31 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v44, v27 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v34 +; SI-NEXT: v_mov_b32_e32 v34, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v31, v38 +; SI-NEXT: v_mov_b32_e32 v38, v40 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v48i16_to_v12i64_scalar: @@ -30765,88 +30759,88 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v56, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v7, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v9, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v10, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_or_b32_e32 v11, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_or_b32_e32 v12, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_or_b32_e32 v13, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_or_b32_e32 v20, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -30856,13 +30850,13 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v22, v0, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v58 -; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v57 +; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -30872,60 +30866,60 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -30956,17 +30950,17 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -30974,7 +30968,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -31004,66 +30998,64 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v39, v37 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v38 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v39 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: v_mov_b32_e32 v52, v48 +; SI-NEXT: v_mov_b32_e32 v48, v35 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v57 ; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v40, v38 +; SI-NEXT: v_mov_b32_e32 v38, v31 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v44, v27 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v34 +; SI-NEXT: v_mov_b32_e32 v34, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v31, v38 +; SI-NEXT: v_mov_b32_e32 v38, v40 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v48i16_to_v12f64_scalar: @@ -38404,13 +38396,13 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v54, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -38455,74 +38447,83 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v29, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 +; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_or_b32_e32 v60, v30, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v26 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_or_b32_e32 v58, v33, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v60, v28, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v52 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v54 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v52, v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v25 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v57 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_or_b32_e32 v57, v33, v23 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 +; SI-NEXT: v_or_b32_e32 v52, v32, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v33 +; SI-NEXT: v_or_b32_e32 v49, v31, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_or_b32_e32 v35, v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v57, v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v56, v24, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -38544,7 +38545,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -38557,7 +38558,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v38 ; SI-NEXT: v_or_b32_e32 v59, v2, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -38580,121 +38581,115 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v36 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_or_b32_e32 v14, v14, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v58 ; SI-NEXT: v_or_b32_e32 v20, v20, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v18 ; SI-NEXT: v_or_b32_e32 v19, v19, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 ; SI-NEXT: v_or_b32_e32 v17, v17, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v35 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37 -; SI-NEXT: v_or_b32_e32 v38, v33, v24 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v38, v31, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v42 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v36 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v37, v35, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 +; SI-NEXT: v_or_b32_e32 v37, v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_or_b32_e32 v58, v30, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 -; SI-NEXT: v_or_b32_e32 v51, v33, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 +; SI-NEXT: v_or_b32_e32 v51, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v50, v24, v33 +; SI-NEXT: v_or_b32_e32 v50, v24, v31 ; SI-NEXT: v_or_b32_e32 v8, v8, v29 ; SI-NEXT: v_or_b32_e32 v7, v7, v30 ; SI-NEXT: v_or_b32_e32 v6, v6, v55 ; SI-NEXT: v_or_b32_e32 v21, v21, v45 -; SI-NEXT: v_or_b32_e32 v28, v28, v25 -; SI-NEXT: v_or_b32_e32 v27, v27, v46 -; SI-NEXT: v_alignbit_b32 v44, v50, v31, 16 -; SI-NEXT: v_alignbit_b32 v43, v51, v32, 16 +; SI-NEXT: v_alignbit_b32 v44, v50, v27, 16 +; SI-NEXT: v_alignbit_b32 v43, v51, v28, 16 ; SI-NEXT: v_alignbit_b32 v42, v37, v29, 16 +; SI-NEXT: v_mov_b32_e32 v29, v49 ; SI-NEXT: v_alignbit_b32 v41, v38, v30, 16 ; SI-NEXT: v_alignbit_b32 v40, v17, v55, 16 ; SI-NEXT: v_alignbit_b32 v55, v19, v45, 16 ; SI-NEXT: v_alignbit_b32 v54, v20, v26, 16 ; SI-NEXT: v_alignbit_b32 v26, v14, v25, 16 ; SI-NEXT: v_alignbit_b32 v25, v10, v46, 16 +; SI-NEXT: v_mov_b32_e32 v46, v35 ; SI-NEXT: v_alignbit_b32 v24, v11, v23, 16 ; SI-NEXT: v_alignbit_b32 v23, v5, v22, 16 ; SI-NEXT: v_alignbit_b32 v22, v59, v47, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v48 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v42 -; SI-NEXT: v_or_b32_e32 v8, v8, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v8, v29, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v42 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v8, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v36 -; SI-NEXT: v_or_b32_e32 v8, v8, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v8, v29, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v8, v27, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 @@ -38744,7 +38739,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 @@ -38756,7 +38751,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v46 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 97d040b545c09..ed67c6b397f11 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -5490,116 +5490,119 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: v_mov_b32_e32 v48, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v10, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v15, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_or_b32_e32 v16, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_or_b32_e32 v17, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_or_b32_e32 v19, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_or_b32_e32 v20, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v62 -; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -5609,72 +5612,74 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -5684,13 +5689,13 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -5700,17 +5705,17 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -5718,7 +5723,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -5748,85 +5753,87 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, v43 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v38 +; SI-NEXT: v_mov_b32_e32 v38, v62 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v40, v39 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v45 +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v41, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v52 +; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v61, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v32 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v43 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v43 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v43, v46 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v57, v27 +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v55 ; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v41 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v45, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v44, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v52i16_to_v26i32_scalar: @@ -15715,116 +15722,119 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: v_mov_b32_e32 v48, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v10, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v15, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_or_b32_e32 v16, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_or_b32_e32 v17, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_or_b32_e32 v19, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_or_b32_e32 v20, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v62 -; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -15834,72 +15844,74 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -15909,13 +15921,13 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -15925,17 +15937,17 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -15943,7 +15955,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -15973,85 +15985,87 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, v43 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v38 +; SI-NEXT: v_mov_b32_e32 v38, v62 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v40, v39 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v45 +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v41, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v52 +; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v61, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v32 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v43 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v43 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v43, v46 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v57, v27 +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v55 ; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v41 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v45, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v44, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v52i16_to_v26f32_scalar: @@ -25141,116 +25155,119 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: v_mov_b32_e32 v48, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v10, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v15, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_or_b32_e32 v16, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_or_b32_e32 v17, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_or_b32_e32 v19, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_or_b32_e32 v20, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v62 -; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -25260,72 +25277,74 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -25335,13 +25354,13 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -25351,17 +25370,17 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -25369,7 +25388,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -25399,85 +25418,87 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, v43 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v38 +; SI-NEXT: v_mov_b32_e32 v38, v62 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v40, v39 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v45 +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v41, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v52 +; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v61, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v32 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v43 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v43 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v43, v46 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v57, v27 +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v55 ; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v41 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v45, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v44, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v52i16_to_v13i64_scalar: @@ -33661,116 +33682,119 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: v_mov_b32_e32 v48, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v10, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v15, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_or_b32_e32 v16, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_or_b32_e32 v17, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_or_b32_e32 v19, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_or_b32_e32 v20, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v62 -; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -33780,72 +33804,74 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -33855,13 +33881,13 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -33871,17 +33897,17 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -33889,7 +33915,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -33919,85 +33945,87 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, v43 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v38 +; SI-NEXT: v_mov_b32_e32 v38, v62 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v40, v39 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v45 +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v41, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v52 +; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v61, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v32 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v43 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v43 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v43, v46 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v57, v27 +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v55 ; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v41 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v45, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v44, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v52i16_to_v13f64_scalar: @@ -35773,11 +35801,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: s_lshr_b32 s40, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 ; SI-NEXT: s_lshr_b32 s40, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s40 ; SI-NEXT: s_lshr_b32 s40, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 ; SI-NEXT: s_lshr_b32 s40, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s40 ; SI-NEXT: s_lshr_b32 s40, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s40 ; SI-NEXT: s_lshr_b32 s40, s13, 16 @@ -35816,20 +35845,20 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 ; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 @@ -35846,22 +35875,6 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 ; SI-NEXT: v_add_f64 v[54:55], s[18:19], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 ; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0 ; SI-NEXT: v_add_f64 v[37:38], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 @@ -35872,41 +35885,54 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 ; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v34 +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 @@ -35915,13 +35941,17 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 @@ -35935,14 +35965,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v51, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 @@ -36028,7 +36056,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v11 ; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v13, v10 @@ -36042,7 +36070,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v10, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 ; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v13, v10 @@ -36056,7 +36084,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 ; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v13, v10 @@ -36070,56 +36098,56 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v13, v10 ; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -36167,33 +36195,33 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB53_2 ; @@ -42201,23 +42229,22 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 @@ -42227,6 +42254,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v24 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v27 @@ -42236,35 +42264,35 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v18, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v26, v36 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v53 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v54 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v34 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -42276,38 +42304,46 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v22, v2 ; SI-NEXT: v_mov_b32_e32 v39, v3 ; SI-NEXT: v_mov_b32_e32 v49, v5 -; SI-NEXT: v_mov_b32_e32 v60, v7 -; SI-NEXT: v_mov_b32_e32 v62, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v61, v8 +; SI-NEXT: v_mov_b32_e32 v63, v4 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v63, v34, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v42 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -42319,13 +42355,13 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -42340,8 +42376,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 @@ -42355,94 +42390,82 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_or_b32_e32 v2, v33, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_or_b32_e32 v2, v34, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v33, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v34 -; SI-NEXT: v_or_b32_e32 v62, v35, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 +; SI-NEXT: v_or_b32_e32 v61, v35, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v34 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v49 +; SI-NEXT: v_or_b32_e32 v54, v33, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v60, v34, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_or_b32_e32 v49, v33, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v55 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v49, v34, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v56 +; SI-NEXT: v_or_b32_e32 v39, v35, v55 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_or_b32_e32 v22, v22, v29 ; SI-NEXT: v_or_b32_e32 v37, v33, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_or_b32_e32 v35, v34, v27 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 ; SI-NEXT: v_or_b32_e32 v59, v28, v26 -; SI-NEXT: v_or_b32_e32 v39, v35, v55 -; SI-NEXT: v_or_b32_e32 v30, v30, v27 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 @@ -42461,18 +42484,22 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_or_b32_e32 v31, v25, v57 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_or_b32_e32 v54, v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 -; SI-NEXT: v_or_b32_e32 v53, v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 +; SI-NEXT: v_or_b32_e32 v53, v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_or_b32_e32 v48, v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v62 ; SI-NEXT: v_or_b32_e32 v9, v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v60 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v10 ; SI-NEXT: v_or_b32_e32 v12, v12, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v18, v18, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 @@ -42480,8 +42507,8 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v20 ; SI-NEXT: v_or_b32_e32 v15, v15, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_or_b32_e32 v4, v4, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v4, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 ; SI-NEXT: v_or_b32_e32 v6, v6, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 @@ -42497,10 +42524,10 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v52, v25, v33 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v50 ; SI-NEXT: v_or_b32_e32 v51, v28, v25 -; SI-NEXT: v_alignbit_b32 v45, v51, v38, 16 +; SI-NEXT: v_alignbit_b32 v45, v51, v30, 16 ; SI-NEXT: v_alignbit_b32 v44, v52, v44, 16 ; SI-NEXT: v_alignbit_b32 v43, v6, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, v4, v41, 16 +; SI-NEXT: v_alignbit_b32 v42, v11, v41, 16 ; SI-NEXT: v_alignbit_b32 v41, v15, v46, 16 ; SI-NEXT: v_alignbit_b32 v40, v21, v55, 16 ; SI-NEXT: v_alignbit_b32 v55, v23, v29, 16 @@ -42508,56 +42535,60 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v28, v12, v27, 16 ; SI-NEXT: v_alignbit_b32 v27, v14, v26, 16 ; SI-NEXT: v_alignbit_b32 v26, v9, v56, 16 -; SI-NEXT: v_alignbit_b32 v25, v53, v24, 16 -; SI-NEXT: v_alignbit_b32 v24, v54, v57, 16 +; SI-NEXT: v_mov_b32_e32 v56, v35 +; SI-NEXT: v_alignbit_b32 v25, v48, v24, 16 +; SI-NEXT: v_alignbit_b32 v24, v53, v57, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 +; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v50 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v44 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_add_i32_e32 v33, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen @@ -42610,7 +42641,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 @@ -42629,7 +42660,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen @@ -42641,7 +42672,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen @@ -42652,8 +42683,8 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -42664,7 +42695,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index a43ce77b20631..099642646f8cc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -5900,48 +5900,52 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v32, v26 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v44, v16 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -5950,87 +5954,87 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v13, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v13, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v17, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: v_or_b32_e32 v16, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: v_or_b32_e32 v26, v0, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v29 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -6040,9 +6044,10 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -6085,96 +6090,96 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -6198,74 +6203,83 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mov_b32_e32 v41, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v41, v30 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v56, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v46, v49 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v47 -; SI-NEXT: v_mov_b32_e32 v47, v58 -; SI-NEXT: v_mov_b32_e32 v58, v61 -; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v53 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: v_mov_b32_e32 v49, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v60 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v57, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v45 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v56i16_to_v28i32_scalar: @@ -17015,48 +17029,52 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v32, v26 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v44, v16 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -17065,87 +17083,87 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v13, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v13, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v17, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: v_or_b32_e32 v16, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: v_or_b32_e32 v26, v0, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v29 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -17155,9 +17173,10 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -17200,96 +17219,96 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -17313,74 +17332,83 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mov_b32_e32 v41, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v41, v30 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v56, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v46, v49 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v47 -; SI-NEXT: v_mov_b32_e32 v47, v58 -; SI-NEXT: v_mov_b32_e32 v58, v61 -; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v53 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: v_mov_b32_e32 v49, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v60 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v57, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v45 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v56i16_to_v28f32_scalar: @@ -19525,8 +19553,8 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 @@ -19565,8 +19593,8 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_mov_b32_e32 v45, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 @@ -27271,48 +27299,52 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v32, v26 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v44, v16 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -27321,87 +27353,87 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v13, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v13, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v17, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: v_or_b32_e32 v16, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: v_or_b32_e32 v26, v0, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v29 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -27411,9 +27443,10 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -27456,96 +27489,96 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -27569,74 +27602,83 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mov_b32_e32 v41, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v41, v30 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v56, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v46, v49 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v47 -; SI-NEXT: v_mov_b32_e32 v47, v58 -; SI-NEXT: v_mov_b32_e32 v58, v61 -; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v53 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: v_mov_b32_e32 v49, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v60 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v57, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v45 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v56i16_to_v14i64_scalar: @@ -36579,48 +36621,52 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v32, v26 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v44, v16 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -36629,87 +36675,87 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v13, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v13, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v17, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: v_or_b32_e32 v16, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: v_or_b32_e32 v26, v0, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v29 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -36719,9 +36765,10 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -36764,96 +36811,96 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -36877,74 +36924,83 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mov_b32_e32 v41, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v41, v30 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v56, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v46, v49 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v47 -; SI-NEXT: v_mov_b32_e32 v47, v58 -; SI-NEXT: v_mov_b32_e32 v58, v61 -; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v53 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: v_mov_b32_e32 v49, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v60 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v57, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v45 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v56i16_to_v14f64_scalar: @@ -37772,7 +37828,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -37784,7 +37839,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 @@ -37794,23 +37848,25 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 @@ -37819,31 +37875,31 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 @@ -37875,6 +37931,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 @@ -37882,26 +37939,25 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v29, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 @@ -37936,34 +37992,24 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[54:55], v[1:2], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_mov_b32_e32 v42, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 @@ -37977,14 +38023,16 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -38005,13 +38053,14 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -38020,6 +38069,9 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 @@ -38034,29 +38086,29 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v47, v26 -; SI-NEXT: v_mov_b32_e32 v45, v27 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v25 +; SI-NEXT: v_mov_b32_e32 v45, v26 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 @@ -38107,14 +38159,16 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -38123,7 +38177,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -38132,7 +38186,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -38141,7 +38195,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -38150,7 +38204,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -38159,7 +38213,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -38168,7 +38222,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -38178,8 +38232,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -38189,8 +38243,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -38200,8 +38254,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -38211,8 +38265,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -38221,7 +38275,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 @@ -38230,7 +38284,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -38239,7 +38293,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 @@ -38248,19 +38302,10 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -38269,7 +38314,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -38278,20 +38323,27 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -38889,17 +38941,17 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s42, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 ; SI-NEXT: s_lshr_b32 s42, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s42 ; SI-NEXT: s_lshr_b32 s42, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 ; SI-NEXT: s_lshr_b32 s42, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 ; SI-NEXT: s_lshr_b32 s42, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 ; SI-NEXT: s_lshr_b32 s42, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 ; SI-NEXT: s_lshr_b32 s42, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s42 ; SI-NEXT: s_lshr_b32 s42, s10, 16 @@ -38907,7 +38959,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_lshr_b32 s42, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 ; SI-NEXT: s_lshr_b32 s42, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 ; SI-NEXT: s_lshr_b32 s42, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s42 ; SI-NEXT: s_lshr_b32 s42, s14, 16 @@ -38944,18 +38996,18 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v46, s42 ; SI-NEXT: s_lshr_b32 s42, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s9 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s8 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s11 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 @@ -38969,13 +39021,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 @@ -38995,37 +39047,41 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v41, v43 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v42 ; SI-NEXT: v_add_f64 v[49:50], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[37:38], s[24:25], 1.0 ; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v44 ; SI-NEXT: v_add_f64 v[53:54], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[35:36], s[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 +; SI-NEXT: v_add_f64 v[35:36], s[26:27], 1.0 ; SI-NEXT: v_add_f64 v[30:31], s[28:29], 1.0 ; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 ; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v35 @@ -39035,20 +39091,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 @@ -39058,20 +39110,20 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 @@ -39094,9 +39146,9 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 @@ -39114,14 +39166,14 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 ; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 ; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -39135,7 +39187,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 ; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -39218,7 +39270,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -39233,16 +39285,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 @@ -39254,35 +39306,35 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -39306,19 +39358,19 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -39343,25 +39395,25 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v14f64_to_v56f16_scalar: @@ -45958,30 +46010,30 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 @@ -45992,9 +46044,9 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v21, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 @@ -46008,69 +46060,76 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v63, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v37 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v51 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v48, v7 -; SI-NEXT: v_mov_b32_e32 v51, v9 -; SI-NEXT: v_mov_b32_e32 v52, v11 -; SI-NEXT: v_mov_b32_e32 v54, v13 -; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v11 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: v_mov_b32_e32 v48, v5 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v31, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v38 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -46085,6 +46144,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -46092,7 +46152,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 @@ -46103,108 +46162,100 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v46 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v31 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v38, v46 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: v_or_b32_e32 v5, v38, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v48, v39, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v7, v31, v45 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v44 -; SI-NEXT: v_or_b32_e32 v41, v37, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_or_b32_e32 v9, v37, v45 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v51 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v9, v38, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v54, v39, v43 -; SI-NEXT: v_or_b32_e32 v52, v38, v42 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v37 +; SI-NEXT: v_or_b32_e32 v41, v39, v43 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_or_b32_e32 v55, v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v51, v25, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v54, v25, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 +; SI-NEXT: v_or_b32_e32 v52, v37, v40 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v48, v37, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v62 -; SI-NEXT: v_or_b32_e32 v57, v21, v30 +; SI-NEXT: v_or_b32_e32 v51, v21, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v37 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 -; SI-NEXT: v_or_b32_e32 v62, v25, v37 +; SI-NEXT: v_or_b32_e32 v62, v25, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 @@ -46217,56 +46268,61 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v61, v29, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v60 ; SI-NEXT: v_or_b32_e32 v49, v21, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_or_b32_e32 v35, v25, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_or_b32_e32 v37, v25, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_or_b32_e32 v33, v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_or_b32_e32 v39, v29, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v53 -; SI-NEXT: v_or_b32_e32 v36, v21, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_or_b32_e32 v55, v7, v21 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v25, v50 -; SI-NEXT: v_alignbit_b32 v26, v36, v26, 16 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_or_b32_e32 v34, v21, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 -; SI-NEXT: v_or_b32_e32 v53, v25, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 -; SI-NEXT: v_or_b32_e32 v59, v13, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; SI-NEXT: v_or_b32_e32 v53, v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 +; SI-NEXT: v_or_b32_e32 v50, v25, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_or_b32_e32 v35, v13, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 ; SI-NEXT: v_or_b32_e32 v16, v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14 ; SI-NEXT: v_or_b32_e32 v15, v15, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_alignbit_b32 v29, v59, v28, 16 -; SI-NEXT: v_alignbit_b32 v28, v53, v27, 16 +; SI-NEXT: v_alignbit_b32 v29, v35, v28, 16 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_alignbit_b32 v27, v55, v60, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16 ; SI-NEXT: v_or_b32_e32 v22, v21, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 ; SI-NEXT: v_or_b32_e32 v24, v24, v21 @@ -46274,16 +46330,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v17 ; SI-NEXT: v_or_b32_e32 v10, v10, v21 -; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 -; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16 -; SI-NEXT: v_alignbit_b32 v30, v16, v37, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 ; SI-NEXT: v_or_b32_e32 v5, v5, v21 @@ -46294,14 +46341,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v56, v3, v47, 16 ; SI-NEXT: v_alignbit_b32 v47, v6, v46, 16 ; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 -; SI-NEXT: v_alignbit_b32 v45, v1, v31, 16 -; SI-NEXT: v_alignbit_b32 v21, v24, v38, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v45, v1, v57, 16 +; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 +; SI-NEXT: v_alignbit_b32 v21, v24, v58, 16 +; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v16, v59, 16 +; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16 +; SI-NEXT: v_mov_b32_e32 v60, v37 +; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v56 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v56 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 @@ -46309,15 +46364,13 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen @@ -46335,26 +46388,28 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 @@ -46366,7 +46421,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 @@ -46378,7 +46433,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 @@ -46390,7 +46445,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 @@ -46402,7 +46457,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 @@ -46432,8 +46487,8 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -46444,32 +46499,32 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -47169,137 +47224,149 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v33, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v47 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v40 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v41 ; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v54 +; SI-NEXT: v_mov_b32_e32 v61, v57 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v43 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v48 +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v42 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v48, v33, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v40 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v61 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v57, v54, v29 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v54 +; SI-NEXT: v_or_b32_e32 v54, v33, v29 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_or_b32_e32 v4, v4, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v7, v7, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 +; SI-NEXT: v_or_b32_e32 v7, v7, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_or_b32_e32 v6, v6, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_or_b32_e32 v13, v13, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; SI-NEXT: v_or_b32_e32 v16, v16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; SI-NEXT: v_or_b32_e32 v15, v15, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v6, v6, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v13, v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; SI-NEXT: v_or_b32_e32 v34, v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; SI-NEXT: v_or_b32_e32 v37, v37, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v16, v16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_or_b32_e32 v36, v36, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v33 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -47307,7 +47374,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 @@ -47315,26 +47381,19 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v15, v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v34, v34, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_or_b32_e32 v37, v37, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v36, v36, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 @@ -47343,7 +47402,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 @@ -47351,15 +47409,14 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v51, v51, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v39 +; SI-NEXT: v_or_b32_e32 v51, v33, v51 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 @@ -47367,22 +47424,21 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v52, v52, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_or_b32_e32 v20, v20, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_or_b32_e32 v27, v27, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v54 +; SI-NEXT: v_or_b32_e32 v52, v52, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; SI-NEXT: v_or_b32_e32 v20, v20, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; SI-NEXT: v_or_b32_e32 v27, v27, v33 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v41 +; SI-NEXT: v_or_b32_e32 v25, v25, v33 ; SI-NEXT: v_or_b32_e32 v28, v28, v47 ; SI-NEXT: v_or_b32_e32 v26, v26, v46 ; SI-NEXT: v_or_b32_e32 v23, v23, v45 -; SI-NEXT: v_or_b32_e32 v22, v22, v33 +; SI-NEXT: v_or_b32_e32 v22, v22, v57 ; SI-NEXT: v_or_b32_e32 v50, v50, v43 -; SI-NEXT: v_or_b32_e32 v48, v48, v42 ; SI-NEXT: v_or_b32_e32 v38, v38, v58 ; SI-NEXT: v_or_b32_e32 v3, v3, v40 ; SI-NEXT: v_or_b32_e32 v18, v18, v55 @@ -47393,7 +47449,8 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_alignbit_b32 v56, v25, v47, 16 ; SI-NEXT: v_alignbit_b32 v47, v27, v46, 16 ; SI-NEXT: v_alignbit_b32 v46, v20, v45, 16 -; SI-NEXT: v_alignbit_b32 v45, v52, v33, 16 +; SI-NEXT: v_alignbit_b32 v45, v52, v57, 16 +; SI-NEXT: v_mov_b32_e32 v57, v54 ; SI-NEXT: v_alignbit_b32 v44, v51, v43, 16 ; SI-NEXT: v_alignbit_b32 v43, v36, v42, 16 ; SI-NEXT: v_alignbit_b32 v42, v37, v58, 16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4f46875076809..7785a9938e1be 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -6300,41 +6300,44 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v44, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v48, v24 +; SI-NEXT: v_mov_b32_e32 v49, v22 +; SI-NEXT: v_mov_b32_e32 v47, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 @@ -6345,8 +6348,8 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -6358,51 +6361,51 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v9, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: v_or_b32_e32 v21, v0, v43 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -6415,17 +6418,17 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -6436,30 +6439,30 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v23, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v22, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v23, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v25, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v27, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v27, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_or_b32_e32 v28, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -6502,119 +6505,119 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB15_3: ; %end @@ -6637,35 +6640,67 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v58 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v57, v56 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v44, v42 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v62 ; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v53 +; SI-NEXT: v_mov_b32_e32 v53, v61 +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v57, v50 +; SI-NEXT: v_mov_b32_e32 v50, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v31, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v62, v47 -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: v_mov_b32_e32 v56, v57 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v58, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v50 +; SI-NEXT: v_mov_b32_e32 v50, v57 +; SI-NEXT: v_mov_b32_e32 v57, v51 +; SI-NEXT: v_mov_b32_e32 v51, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v35 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v60i16_to_v30i32_scalar: @@ -18310,41 +18345,44 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v44, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v48, v24 +; SI-NEXT: v_mov_b32_e32 v49, v22 +; SI-NEXT: v_mov_b32_e32 v47, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 @@ -18355,8 +18393,8 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -18368,51 +18406,51 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v9, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: v_or_b32_e32 v21, v0, v43 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -18425,17 +18463,17 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -18446,30 +18484,30 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v23, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v22, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v23, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v25, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v27, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v27, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_or_b32_e32 v28, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -18512,119 +18550,119 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB31_3: ; %end @@ -18647,35 +18685,67 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v58 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v57, v56 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v44, v42 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v62 ; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v53 +; SI-NEXT: v_mov_b32_e32 v53, v61 +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v57, v50 +; SI-NEXT: v_mov_b32_e32 v50, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v31, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v62, v47 -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: v_mov_b32_e32 v56, v57 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v58, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v50 +; SI-NEXT: v_mov_b32_e32 v50, v57 +; SI-NEXT: v_mov_b32_e32 v57, v51 +; SI-NEXT: v_mov_b32_e32 v51, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v35 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v60i16_to_v30f32_scalar: @@ -29422,41 +29492,44 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v44, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v48, v24 +; SI-NEXT: v_mov_b32_e32 v49, v22 +; SI-NEXT: v_mov_b32_e32 v47, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 @@ -29467,8 +29540,8 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -29480,51 +29553,51 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v9, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: v_or_b32_e32 v21, v0, v43 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -29537,17 +29610,17 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -29558,30 +29631,30 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v23, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v22, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v23, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v25, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v27, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v27, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_or_b32_e32 v28, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -29624,119 +29697,119 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB43_3: ; %end @@ -29759,35 +29832,67 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v58 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v57, v56 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v44, v42 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v62 ; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v53 +; SI-NEXT: v_mov_b32_e32 v53, v61 +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v57, v50 +; SI-NEXT: v_mov_b32_e32 v50, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v31, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v62, v47 -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: v_mov_b32_e32 v56, v57 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v58, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v50 +; SI-NEXT: v_mov_b32_e32 v50, v57 +; SI-NEXT: v_mov_b32_e32 v57, v51 +; SI-NEXT: v_mov_b32_e32 v51, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v35 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v60i16_to_v15i64_scalar: @@ -39540,41 +39645,44 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v44, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v48, v24 +; SI-NEXT: v_mov_b32_e32 v49, v22 +; SI-NEXT: v_mov_b32_e32 v47, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 @@ -39585,8 +39693,8 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -39598,51 +39706,51 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v9, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: v_or_b32_e32 v21, v0, v43 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -39655,17 +39763,17 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -39676,30 +39784,30 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v23, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v22, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v23, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v25, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v27, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v27, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_or_b32_e32 v28, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -39742,119 +39850,119 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB51_3: ; %end @@ -39877,35 +39985,67 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v58 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v57, v56 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v44, v42 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v62 ; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v53 +; SI-NEXT: v_mov_b32_e32 v53, v61 +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v57, v50 +; SI-NEXT: v_mov_b32_e32 v50, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v31, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v62, v47 -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: v_mov_b32_e32 v56, v57 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v58, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v50 +; SI-NEXT: v_mov_b32_e32 v50, v57 +; SI-NEXT: v_mov_b32_e32 v57, v51 +; SI-NEXT: v_mov_b32_e32 v51, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v35 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v60i16_to_v15f64_scalar: @@ -40737,6 +40877,10 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -40777,16 +40921,14 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr58 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -40824,55 +40966,53 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -40899,30 +41039,30 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -40931,33 +41071,32 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v59 -; SI-NEXT: v_mov_b32_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 -; SI-NEXT: v_mov_b32_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v63 -; SI-NEXT: v_mov_b32_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v38 -; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 +; SI-NEXT: v_mov_b32_e32 v60, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v62 +; SI-NEXT: v_mov_b32_e32 v62, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v63, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v48 -; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -40978,32 +41117,24 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[32:33], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[49:50], v[3:4], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f64 v[32:33], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -41011,8 +41142,9 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 @@ -41032,15 +41164,16 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -41067,15 +41200,16 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 @@ -41095,22 +41229,22 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v46, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -41119,7 +41253,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 @@ -41238,7 +41372,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -41247,7 +41381,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -41256,7 +41390,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -41292,16 +41426,16 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 @@ -41310,7 +41444,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -41319,47 +41453,47 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -41370,19 +41504,19 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -42027,29 +42161,28 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s44, s5, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s44 ; SI-NEXT: s_lshr_b32 s44, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s44 ; SI-NEXT: s_lshr_b32 s44, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 ; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s44 ; SI-NEXT: s_lshr_b32 s44, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 ; SI-NEXT: s_lshr_b32 s44, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 ; SI-NEXT: s_lshr_b32 s44, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s44 ; SI-NEXT: s_lshr_b32 s44, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 ; SI-NEXT: s_lshr_b32 s44, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s44 ; SI-NEXT: s_lshr_b32 s44, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s44 ; SI-NEXT: s_lshr_b32 s44, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 ; SI-NEXT: s_lshr_b32 s44, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s44 ; SI-NEXT: s_lshr_b32 s44, s41, 16 @@ -42087,15 +42220,17 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_lshr_b32 s44, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s44 ; SI-NEXT: s_lshr_b32 s44, s16, 16 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v62, s13 ; SI-NEXT: s_waitcnt expcnt(0) @@ -42146,9 +42281,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 @@ -42156,17 +42291,17 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 ; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 ; SI-NEXT: v_add_f64 v[34:35], s[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 ; SI-NEXT: v_add_f64 v[30:31], s[42:43], 1.0 @@ -42174,7 +42309,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v39 @@ -42183,37 +42318,35 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v56 ; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 @@ -42221,244 +42354,244 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 -; SI-NEXT: v_mov_b32_e32 v59, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v55, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v46, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v5 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v42 -; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v54 -; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v50 -; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -42521,27 +42654,27 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 2c03113e8af47..9b4539c0bb4de 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -103,55 +103,47 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 ; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5] -; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17] +; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[16:17] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 -; CHECK-NEXT: s_mov_b32 s16, 0 -; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v2, s16 -; CHECK-NEXT: v_mov_b32_e32 v3, s17 -; CHECK-NEXT: s_mov_b32 s18, s16 -; CHECK-NEXT: s_mov_b32 s19, s16 -; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[16:19] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[38:39] -; CHECK-NEXT: s_mov_b64 s[12:13], s[40:41] -; CHECK-NEXT: s_mov_b64 s[14:15], s[42:43] -; CHECK-NEXT: v_readlane_b32 s36, v6, 0 -; CHECK-NEXT: v_readlane_b32 s44, v6, 8 -; CHECK-NEXT: v_readlane_b32 s45, v6, 9 -; CHECK-NEXT: v_readlane_b32 s46, v6, 10 -; CHECK-NEXT: v_readlane_b32 s47, v6, 11 -; CHECK-NEXT: v_readlane_b32 s48, v6, 12 -; CHECK-NEXT: v_readlane_b32 s49, v6, 13 -; CHECK-NEXT: v_readlane_b32 s50, v6, 14 -; CHECK-NEXT: v_readlane_b32 s51, v6, 15 -; CHECK-NEXT: v_readlane_b32 s37, v6, 1 -; CHECK-NEXT: v_readlane_b32 s38, v6, 2 -; CHECK-NEXT: v_readlane_b32 s39, v6, 3 -; CHECK-NEXT: v_readlane_b32 s40, v6, 4 -; CHECK-NEXT: v_readlane_b32 s41, v6, 5 -; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[24:27] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s42, v6, 6 -; CHECK-NEXT: v_readlane_b32 s43, v6, 7 +; CHECK-NEXT: s_mov_b32 s44, 0 +; CHECK-NEXT: s_mov_b32 s45, s44 +; CHECK-NEXT: v_mov_b32_e32 v2, s44 +; CHECK-NEXT: v_mov_b32_e32 v3, s45 +; CHECK-NEXT: s_mov_b32 s46, s44 +; CHECK-NEXT: s_mov_b32 s47, s44 +; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[44:47] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s4, v6, 0 +; CHECK-NEXT: v_readlane_b32 s12, v6, 8 +; CHECK-NEXT: v_readlane_b32 s13, v6, 9 +; CHECK-NEXT: v_readlane_b32 s14, v6, 10 +; CHECK-NEXT: v_readlane_b32 s15, v6, 11 +; CHECK-NEXT: v_readlane_b32 s16, v6, 12 +; CHECK-NEXT: v_readlane_b32 s17, v6, 13 +; CHECK-NEXT: v_readlane_b32 s18, v6, 14 +; CHECK-NEXT: v_readlane_b32 s19, v6, 15 +; CHECK-NEXT: v_readlane_b32 s5, v6, 1 +; CHECK-NEXT: v_readlane_b32 s6, v6, 2 +; CHECK-NEXT: v_readlane_b32 s7, v6, 3 +; CHECK-NEXT: v_readlane_b32 s8, v6, 4 +; CHECK-NEXT: v_readlane_b32 s9, v6, 5 +; CHECK-NEXT: image_sample_lz v0, v[2:3], s[12:19], s[24:27] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_mov_b64 s[42:43], s[14:15] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_mov_b64 s[40:41], s[12:13] -; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: v_readlane_b32 s10, v6, 6 +; CHECK-NEXT: v_readlane_b32 s11, v6, 7 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[44:47], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[44:47], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[22:23] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader ; CHECK-NEXT: s_mov_b32 s8, 0 @@ -179,7 +171,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: .LBB0_9: ; %Flow13 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] ; CHECK-NEXT: v_readlane_b32 s71, v5, 21 ; CHECK-NEXT: v_readlane_b32 s70, v5, 20 ; CHECK-NEXT: v_readlane_b32 s69, v5, 19 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 5b2213592f495..f93e5f06beff9 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10159,14 +10159,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s3, 8 +; GFX8-NEXT: s_lshr_b32 s4, s3, 8 ; GFX8-NEXT: s_lshr_b32 s48, s3, 15 -; GFX8-NEXT: v_writelane_b32 v62, s0, 0 +; GFX8-NEXT: v_writelane_b32 v62, s4, 0 ; GFX8-NEXT: s_lshr_b32 s74, s3, 30 ; GFX8-NEXT: s_lshr_b32 s30, s3, 31 ; GFX8-NEXT: s_lshr_b32 s72, s3, 28 @@ -10186,11 +10186,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s58, s3, 14 ; GFX8-NEXT: s_lshr_b32 s62, s3, 12 ; GFX8-NEXT: s_lshr_b32 s54, s3, 10 -; GFX8-NEXT: v_writelane_b32 v62, s1, 1 -; GFX8-NEXT: s_lshr_b32 s0, s3, 9 +; GFX8-NEXT: v_writelane_b32 v62, s5, 1 +; GFX8-NEXT: s_lshr_b32 s4, s3, 9 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_lshr_b32 s52, s3, 11 -; GFX8-NEXT: v_writelane_b32 v62, s0, 2 +; GFX8-NEXT: v_writelane_b32 v62, s4, 2 ; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 @@ -10213,8 +10213,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v34, s48 ; GFX8-NEXT: s_lshr_b32 s48, s2, 1 ; GFX8-NEXT: s_lshr_b32 s50, s3, 13 -; GFX8-NEXT: v_writelane_b32 v62, s1, 3 -; GFX8-NEXT: s_lshr_b32 s6, s3, 6 +; GFX8-NEXT: v_writelane_b32 v62, s5, 3 +; GFX8-NEXT: s_lshr_b32 s8, s3, 6 ; GFX8-NEXT: s_lshr_b32 s10, s3, 7 ; GFX8-NEXT: s_lshr_b32 s12, s3, 4 ; GFX8-NEXT: s_lshr_b32 s14, s3, 5 @@ -10264,8 +10264,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s54, s2, 13 ; GFX8-NEXT: s_lshr_b32 s52, s2, 10 ; GFX8-NEXT: v_mov_b32_e32 v30, s46 -; GFX8-NEXT: s_lshr_b32 s4, s2, 11 -; GFX8-NEXT: s_lshr_b32 s0, s2, 8 +; GFX8-NEXT: s_lshr_b32 s6, s2, 11 +; GFX8-NEXT: s_lshr_b32 s4, s2, 8 ; GFX8-NEXT: s_lshr_b32 s46, s2, 9 ; GFX8-NEXT: s_lshr_b32 s44, s2, 6 ; GFX8-NEXT: s_lshr_b32 s42, s2, 7 @@ -10278,14 +10278,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_writelane_b32 v62, s2, 4 ; GFX8-NEXT: v_writelane_b32 v62, s3, 5 ; GFX8-NEXT: v_readlane_b32 s2, v62, 2 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_readlane_b32 s3, v62, 3 -; GFX8-NEXT: v_mov_b32_e32 v38, s50 -; GFX8-NEXT: v_mov_b32_e32 v39, s51 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v35, s49 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; GFX8-NEXT: v_readlane_b32 s2, v62, 0 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_readlane_b32 s3, v62, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, s75 ; GFX8-NEXT: v_mov_b32_e32 v13, s73 @@ -10303,8 +10301,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v29, s57 ; GFX8-NEXT: v_mov_b32_e32 v31, s47 ; GFX8-NEXT: v_mov_b32_e32 v33, s59 -; GFX8-NEXT: v_mov_b32_e32 v35, s49 ; GFX8-NEXT: v_mov_b32_e32 v37, s63 +; GFX8-NEXT: v_mov_b32_e32 v38, s50 +; GFX8-NEXT: v_mov_b32_e32 v39, s51 ; GFX8-NEXT: v_mov_b32_e32 v41, s55 ; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 @@ -10313,7 +10312,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[6:7], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 @@ -10341,83 +10340,84 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x10000 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1f0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1f0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v43, s3 ; GFX8-NEXT: v_mov_b32_e32 v42, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1e0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v45, s3 ; GFX8-NEXT: v_mov_b32_e32 v44, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1d0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v47, s3 ; GFX8-NEXT: v_mov_b32_e32 v46, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1c0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v49, s3 ; GFX8-NEXT: v_mov_b32_e32 v48, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1b0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1b0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v51, s3 ; GFX8-NEXT: v_mov_b32_e32 v50, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1a0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1a0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v53, s3 ; GFX8-NEXT: v_mov_b32_e32 v52, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x190 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x190 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v55, s3 ; GFX8-NEXT: v_mov_b32_e32 v54, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x180 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x180 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v57, s3 ; GFX8-NEXT: v_mov_b32_e32 v56, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x170 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x170 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v59, s3 ; GFX8-NEXT: v_mov_b32_e32 v58, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x160 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x160 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v61, s3 ; GFX8-NEXT: v_mov_b32_e32 v60, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x150 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x150 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x140 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x130 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] +; GFX8-NEXT: s_add_u32 s2, s0, 0x140 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x130 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] ; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x120 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s1 -; GFX8-NEXT: v_mov_b32_e32 v18, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x110 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v17, s3 +; GFX8-NEXT: v_mov_b32_e32 v16, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x120 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: v_mov_b32_e32 v10, s14 @@ -10429,165 +10429,165 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x100 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 ; GFX8-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xe0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s86 ; GFX8-NEXT: v_mov_b32_e32 v3, s87 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s84 ; GFX8-NEXT: v_mov_b32_e32 v1, s85 ; GFX8-NEXT: v_mov_b32_e32 v2, s82 ; GFX8-NEXT: v_mov_b32_e32 v3, s83 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xb0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s80 ; GFX8-NEXT: v_mov_b32_e32 v1, s81 ; GFX8-NEXT: v_mov_b32_e32 v2, s78 ; GFX8-NEXT: v_mov_b32_e32 v3, s79 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xa0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s76 ; GFX8-NEXT: v_mov_b32_e32 v1, s77 ; GFX8-NEXT: v_mov_b32_e32 v2, s74 ; GFX8-NEXT: v_mov_b32_e32 v3, s75 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x90 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s72 ; GFX8-NEXT: v_mov_b32_e32 v1, s73 ; GFX8-NEXT: v_mov_b32_e32 v2, s70 ; GFX8-NEXT: v_mov_b32_e32 v3, s71 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x80 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NEXT: v_mov_b32_e32 v1, s69 ; GFX8-NEXT: v_mov_b32_e32 v2, s66 ; GFX8-NEXT: v_mov_b32_e32 v3, s67 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x70 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s59 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 64 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 48 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 32 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NEXT: v_mov_b32_e32 v1, s45 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NEXT: v_mov_b32_e32 v1, s41 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_readlane_b32 s0, v62, 4 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_readlane_b32 s2, v62, 4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_readlane_b32 s1, v62, 5 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_readlane_b32 s3, v62, 5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NEXT: v_mov_b32_e32 v1, s31 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index f44a0b0ac2c65..bd191a37582c0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -3807,53 +3807,64 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; VI-DS128-NEXT: v_mov_b32_e32 v4, v3 +; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v10 ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23 -; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 -; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 +; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -3864,24 +3875,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 -; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 -; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 +; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 -; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 -; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -3905,27 +3908,27 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 ; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 ; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 -; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 +; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64 +; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80 +; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] -; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v64i16_to_v64i32: @@ -3941,58 +3944,67 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v10 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 -; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 +; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -4003,23 +4015,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 -; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 -; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 +; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 -; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -4043,27 +4048,27 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] -; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = zext <64 x i16> %load to <64 x i32> @@ -4844,8 +4849,8 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-DS128-NEXT: s_mov_b32 s90, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) @@ -4855,66 +4860,67 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 -; VI-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 +; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16 ; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 -; VI-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 +; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 -; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16 +; VI-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 ; VI-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 ; VI-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 +; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 -; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 +; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 -; VI-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -4950,22 +4956,22 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 ; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 ; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 -; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 +; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64 ; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 +; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32 +; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32 +; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] -; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 +; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32: @@ -4981,69 +4987,69 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 -; GFX9-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16 ; GFX9-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 -; GFX9-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 +; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 -; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 ; GFX9-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 ; GFX9-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 +; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 -; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 +; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 -; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -5079,22 +5085,22 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] -; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = sext <64 x i16> %load to <64 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 4c0ab91b7d622..02f39e25cb447 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -3749,7 +3749,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 @@ -3953,7 +3953,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) @@ -4185,8 +4185,12 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 @@ -4198,6 +4202,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -4210,376 +4218,346 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 8, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v107, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v93, 8, v105 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v91 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v77 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v79 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v56, 8, v58 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v57, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v42 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v98, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v86, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v99 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v70 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v67 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v53, 8, v66 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v64 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v50 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v29 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v34 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v28, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x17 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v123, v4, 16, v3 -; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x6 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(28) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v27 -; ALIGNED-NEXT: s_waitcnt vmcnt(26) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v24, 8, v26 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(60) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v12, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v60, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v22 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v19 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v101, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v84, v45, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v60, 16, v45 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v5, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v60, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v60, 16, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 +; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 +; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v45, v45, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v60, v60, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v90, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v60, v104, 8, v92 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v111, 8, v122 +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v60, v110, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v77, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 +; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v92, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v60, v94, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v45, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v73, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 @@ -4587,190 +4565,184 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v0, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:250 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250 ; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:253 +; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249 +; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:255 +; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:253 ; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:242 +; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:252 +; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:248 +; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:242 ; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:245 +; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:241 +; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:247 +; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:245 ; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:234 +; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:235 +; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:233 +; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:239 +; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:237 +; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:238 +; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:236 +; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:227 +; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:225 +; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:231 +; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:229 +; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:230 +; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:228 +; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:224 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:213 +; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:215 +; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:209 +; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:211 +; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:210 +; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:214 +; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:218 +; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:219 +; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:217 +; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:223 +; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:221 +; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:222 +; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:220 +; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:216 +; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:208 +; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 +; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 +; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 +; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:207 +; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:205 +; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:206 +; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:204 +; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:200 +; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:194 +; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:195 +; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:193 +; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:199 +; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:197 +; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 +; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:224 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 +; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 +; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 +; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:191 +; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:189 +; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:190 +; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:188 +; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:184 +; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:178 +; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:179 +; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:177 +; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:183 +; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:181 +; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 +; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 +; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:208 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:192 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 +; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 +; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 +; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:175 +; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:173 +; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:174 +; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:172 +; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:168 +; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:162 +; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:163 +; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:161 +; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:167 +; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:165 +; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 +; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 +; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:152 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:159 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:157 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:158 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:152 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:146 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:147 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -5207,7 +5179,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 @@ -5232,11 +5204,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 @@ -5249,27 +5221,33 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:11 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:13 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:9 -; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:15 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:14 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:12 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:8 +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 +; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:9 +; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:8 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 @@ -5279,13 +5257,13 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload @@ -12939,7 +12917,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill @@ -13170,9 +13148,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 @@ -13181,584 +13163,557 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v124, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v111 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v110, 8, v122 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v104, 8, v106 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v92, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v75 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v59 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v56 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v45, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v41 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v44 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v117, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v116, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v112 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v102 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v96, 8, v97 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v98 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v83, 8, v85 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v80 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v81, 8, v69 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v68 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v53, 8, v66 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v67 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v65 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v52 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v39 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v34, 8, v36 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v31 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x17 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 16, v3 -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: s_waitcnt vmcnt(27) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 -; ALIGNED-NEXT: s_waitcnt vmcnt(25) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v26 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 +; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(59) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v13, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v91, v9, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v78, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 +; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 +; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 +; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen +; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v86, v77, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v11, 8, v12 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v70, v91, 16, v77 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v6, 8, v8 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v7, 8, v5 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v91, 16, v77 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v77, v77, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v0, 8, v91 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v123, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v3, 8, v125 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v3, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v91, v91, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v125, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v5, 8, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v91, v126, 8, v123 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v91 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v77, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v95, 8, v125 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v4, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 -; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:250 +; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:251 +; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:249 +; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:255 +; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:253 +; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:254 +; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:252 +; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:248 +; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:242 +; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:243 +; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:241 +; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:247 +; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:245 +; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:246 +; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:240 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:234 +; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:235 +; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:233 +; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:239 +; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:237 +; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:238 +; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:236 +; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:232 +; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:226 +; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:227 +; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:225 +; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:231 +; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:229 +; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:230 +; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:224 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:213 +; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:215 +; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:209 +; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:211 +; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:210 +; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:214 +; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:212 +; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:218 +; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:219 +; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:217 +; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:223 +; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:221 +; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:222 +; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:220 +; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:216 +; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:208 +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 +; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 +; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:207 +; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:205 +; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:206 +; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:204 +; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:200 +; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:194 +; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:195 +; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:193 +; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:199 +; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:197 +; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 +; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 ; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:224 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 +; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 +; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 +; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:191 +; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:189 +; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:190 +; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:188 +; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:184 +; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:178 +; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:179 +; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:177 +; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:183 +; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:181 +; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 +; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 +; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:208 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 +; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 +; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 +; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:175 +; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:173 +; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:174 +; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:172 +; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:168 +; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:162 +; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:163 +; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:161 +; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:167 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:165 +; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 +; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:192 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:152 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:159 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:157 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:158 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:152 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:146 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:147 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:145 -; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:151 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:149 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload @@ -13767,31 +13722,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload @@ -14215,11 +14170,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 @@ -14232,35 +14187,37 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:11 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:9 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:12 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:8 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:8 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 @@ -14270,13 +14227,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload @@ -14296,7 +14253,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 @@ -14304,7 +14261,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 @@ -14355,55 +14312,54 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(55) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(54) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(47) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14412,27 +14368,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(17) +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -14442,13 +14398,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -14457,13 +14413,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -14509,7 +14465,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) @@ -14744,7 +14700,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 @@ -14756,8 +14714,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -14778,7 +14738,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14788,7 +14748,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill @@ -14796,7 +14756,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14804,231 +14764,171 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v106, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v122 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v120 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v94, 8, v105 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v88, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v56, 8, v59 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v45, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v44, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v42 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v116, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v114, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v98, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v98, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v97, 8, v99 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v81, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v80, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v69, 8, v70 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v54, 8, v67 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v53, 8, v66 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v51, 8, v64 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v38 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v35 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v29 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v31, 8, v34 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v28, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x17 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 ; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 @@ -15044,100 +14944,135 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 ; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x6 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v123, v3, 16, v2 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: s_waitcnt vmcnt(23) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v25, 8, v27 -; ALIGNED-NEXT: s_waitcnt vmcnt(21) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v24, 8, v26 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v104, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v21, 8, v22 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v20 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(60) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v101, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v84, v43, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v43, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v57, 16, v43 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v5, 8, v6 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v57, 16, v43 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: v_lshl_or_b32 v43, v43, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v57, v57, 8, v127 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 -; ALIGNED-NEXT: v_lshl_or_b32 v43, v90, 8, v78 +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v124, 8, v91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v62, v62, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v107, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v108, 8, v110 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v91, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v93, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v126, v43, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v76, 8, v104 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -15158,165 +15093,153 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 ; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 ; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:234 +; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 +; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:233 +; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:239 +; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:237 +; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:238 +; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:236 +; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:226 +; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:227 +; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:225 +; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:231 +; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:229 +; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:230 +; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:228 +; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:224 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:213 +; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:215 +; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:209 +; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:211 +; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:210 +; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:214 +; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:212 +; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:218 +; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:219 +; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:217 +; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:223 +; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:221 +; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:222 +; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:220 +; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:216 +; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:208 +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 +; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 +; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 +; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:207 +; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:205 +; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:206 +; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:204 +; ALIGNED-NEXT: flat_store_byte v[2:3], v84 offset:200 +; ALIGNED-NEXT: flat_store_byte v[2:3], v101 offset:194 +; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:195 +; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:193 +; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:199 +; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:197 +; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 +; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 +; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[2:3], v20 offset:234 -; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:233 -; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:239 -; ALIGNED-NEXT: flat_store_byte v[2:3], v25 offset:237 -; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:238 -; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:236 -; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:232 -; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:226 -; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:227 -; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:225 -; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:231 -; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:229 -; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:230 -; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:228 -; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:224 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 +; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 +; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 +; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:191 +; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:189 +; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:190 +; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:188 +; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:184 +; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:178 +; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:179 +; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:177 +; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:183 +; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:181 +; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 +; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 +; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:213 -; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:215 -; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:209 -; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:211 -; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:210 -; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:212 -; ALIGNED-NEXT: flat_store_byte v[2:3], v49 offset:218 -; ALIGNED-NEXT: flat_store_byte v[2:3], v48 offset:219 -; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:217 -; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:223 -; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:221 -; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:222 -; ALIGNED-NEXT: flat_store_byte v[2:3], v64 offset:220 -; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:216 -; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:208 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:202 -; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:203 -; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:201 -; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:207 -; ALIGNED-NEXT: flat_store_byte v[2:3], v81 offset:205 -; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:206 -; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:204 -; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:194 -; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:195 -; ALIGNED-NEXT: flat_store_byte v[2:3], v98 offset:193 -; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:199 -; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:197 -; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:198 -; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:196 -; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:192 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 +; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 +; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 +; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:175 +; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:173 +; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:174 +; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:172 +; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:168 +; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:162 +; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:163 +; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:161 +; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:167 +; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:165 +; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 +; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 +; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 -; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:186 -; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:187 -; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:185 -; ALIGNED-NEXT: flat_store_byte v[2:3], v114 offset:191 -; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:189 -; ALIGNED-NEXT: flat_store_byte v[2:3], v116 offset:190 -; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:188 -; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:184 -; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:178 -; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:179 -; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:177 -; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:183 -; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:181 -; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:182 -; ALIGNED-NEXT: flat_store_byte v[2:3], v45 offset:180 -; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 -; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:170 -; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:171 -; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:169 -; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:175 -; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:173 -; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:174 -; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:172 -; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:162 -; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:163 -; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:161 -; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:166 -; ALIGNED-NEXT: flat_store_byte v[2:3], v88 offset:164 -; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:158 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v106 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:159 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:157 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:158 -; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:156 -; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:156 +; ALIGNED-NEXT: flat_store_byte v[2:3], v123 offset:152 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:146 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload @@ -15325,14 +15248,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:145 -; ALIGNED-NEXT: flat_store_byte v[2:3], v125 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:151 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload @@ -15767,7 +15692,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:640 +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 @@ -15792,11 +15717,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:17 +; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 @@ -15809,40 +15734,50 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 -; ALIGNED-NEXT: flat_store_byte v[2:3], v90 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:9 -; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:15 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:14 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:12 -; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:8 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:14 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:12 +; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:7 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir index 1da55cf535449..cb652db425421 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir @@ -91,15 +91,17 @@ body: | bb.0: ; GCN-LABEL: name: test_no_remat_s_mov_b32_vreg_src_short_lr ; GCN: renamable $sgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 killed renamable $sgpr0 + ; GCN-NEXT: renamable $sgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 killed renamable $sgpr0 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = IMPLICIT_DEF + ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 killed renamable $sgpr0 + ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0 %0:sreg_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir index 0430c8349f350..1dbeccf7cf984 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -107,7 +107,7 @@ body: | successors: %bb.13(0x80000000) ; GCN-LABEL: bb.7: - ; GCN: undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; GCN: undef %{{.+}}.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -123,7 +123,7 @@ body: | successors: %bb.12(0x80000000) ; GCN-LABEL: bb.9: - ; GCN: undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; GCN: undef %{{.+}}.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -134,7 +134,7 @@ body: | successors: %bb.12(0x80000000) ; GCN-LABEL: bb.10: - ; GCN: undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec + ; GCN: undef %{{.+}}.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index 4a5dc8f300af3..d4a8a0d762afd 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -14,16 +14,15 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) { ; CHECK-LABEL: _Z6kernelILi4000ELi1EEvPd: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[2:3], 0x100 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x0 -; CHECK-NEXT: s_mov_b64 s[0:1], 0x100 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; CHECK-NEXT: s_load_dword s0, s[0:1], 0x0 +; CHECK-NEXT: s_mov_b32 s2, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s1, s2 -; CHECK-NEXT: s_mov_b32 s2, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mov_b32 s3, 0x40260000 ; CHECK-NEXT: s_mov_b32 s5, 0x40280000 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -32,8 +31,8 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0x40140000 -; CHECK-NEXT: s_add_i32 s0, s0, s1 -; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 +; CHECK-NEXT: s_add_i32 s1, s1, s0 +; CHECK-NEXT: s_cmpk_lt_i32 s1, 0xa00 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0x40180000 diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir index 6966c3d8b6d6a..bc8a383a285b2 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir +++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir @@ -36,24 +36,18 @@ body: | ; CHECK-NEXT: SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_AV128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec ; CHECK-NEXT: SI_SPILL_AV32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr7 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, $vgpr7, 0, killed $vgpr3, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_AV32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr5 - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr8 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr8_vgpr9 - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr4, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr6_vgpr7 ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, $vgpr4, 0, $vgpr6, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5) ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll index fba27e3d548cf..ee18a426c1b12 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll @@ -2025,7 +2025,8 @@ define @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v0, (a4) # vscale x 64-byte Folded Spill ; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v24, v16, v24 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v0, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v16, v0 -; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v24, v0 +; RV32-NEXT: vsrl.vi v24, v24, 2 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vadd.vv v24, v16, v24 +; RV32-NEXT: vsrl.vi v16, v24, 4 ; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a3, a3, -241 @@ -2312,16 +2312,16 @@ define @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vsub.vv v24, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vand.vv v8, v24, v0 +; RV32-NEXT: vsrl.vi v24, v24, 2 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsrl.vi v24, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index 6bf882fe47fef..52eaa51051631 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -2193,7 +2193,8 @@ define @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vadd.vi v24, v16, -1 ; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsrl.vi v24, v16, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsrl.vi v16, v24, 1 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v0, (a4) # vscale x 64-byte Folded Spill -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vsub.vv v24, v24, v16 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v0, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v16, v0 -; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v24, v0 +; RV32-NEXT: vsrl.vi v24, v24, 2 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vadd.vv v24, v16, v24 +; RV32-NEXT: vsrl.vi v16, v24, 4 ; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a3, a3, -241 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index 2250ab5bd0bbe..b1af4e685c58f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -1945,49 +1945,49 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB34_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a1, 32 ; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: addi a3, a3, 819 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vmv.v.x v16, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma @@ -2025,12 +2025,12 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t @@ -2247,14 +2247,14 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vor.vv v24, v16, v8 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v0, v16 +; RV32-NEXT: vand.vv v16, v0, v8 ; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v0, v0, v16 -; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v24, v24 ; RV32-NEXT: vsrl.vi v0, v24, 1 @@ -2265,39 +2265,39 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vand.vv v0, v0, v16 ; RV32-NEXT: vsub.vv v24, v24, v0 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v24, v16 +; RV32-NEXT: vand.vv v0, v24, v8 ; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vand.vv v8, v24, v8 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: lui a3, 4112 ; RV32-NEXT: addi a2, a2, -241 ; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v24, v8, v24 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v16, a2 +; RV32-NEXT: vsrl.vx v16, v24, a2 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 @@ -4320,49 +4320,49 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB70_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a1, 32 ; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: addi a3, a3, 819 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vmv.v.x v16, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma @@ -4400,12 +4400,12 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t @@ -4622,14 +4622,14 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vor.vv v24, v16, v8 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v0, v16 +; RV32-NEXT: vand.vv v16, v0, v8 ; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v0, v0, v16 -; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v24, v24 ; RV32-NEXT: vsrl.vi v0, v24, 1 @@ -4640,39 +4640,39 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vand.vv v0, v0, v16 ; RV32-NEXT: vsub.vv v24, v24, v0 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v24, v16 +; RV32-NEXT: vand.vv v0, v24, v8 ; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vand.vv v8, v24, v8 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: lui a3, 4112 ; RV32-NEXT: addi a2, a2, -241 ; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v24, v8, v24 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v16, a2 +; RV32-NEXT: vsrl.vx v16, v24, a2 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index 94fecbdfde18e..a993ed909d940 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1449,27 +1449,24 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a2, a2, 1365 ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v8, v24, v0.t @@ -1494,14 +1491,17 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 -; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill @@ -1515,25 +1515,22 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v8, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb @@ -1543,12 +1540,15 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index bdce00b10e5a7..1922006b8a581 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -1810,10 +1810,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vadd.vi v0, v16, -1 ; RV32-NEXT: vnot.v v16, v16 ; RV32-NEXT: vand.vv v0, v16, v0 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v24, v16 ; RV32-NEXT: vsrl.vi v24, v24, 2 @@ -1825,14 +1825,14 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsub.vv v24, v0, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v0, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v0, v24, v16 ; RV32-NEXT: vsrl.vi v24, v24, 2 @@ -3715,10 +3715,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vadd.vi v0, v16, -1 ; RV32-NEXT: vnot.v v16, v16 ; RV32-NEXT: vand.vv v0, v16, v0 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v24, v16 ; RV32-NEXT: vsrl.vi v24, v24, 2 @@ -3730,14 +3730,14 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsub.vv v24, v0, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v0, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v0, v24, v16 ; RV32-NEXT: vsrl.vi v24, v24, 2 diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll index 306cacb31bdef..06d54fadaeffd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/remat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll @@ -126,18 +126,18 @@ define void @vmv.v.x_needs_extended(ptr %p, i64 %x) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: vl8re64.v v8, (a0) -; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: vs8r.v v16, (a0) ; CHECK-NEXT: vs8r.v v0, (a0) ; CHECK-NEXT: vs8r.v v24, (a0) +; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vs8r.v v16, (a0) -; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll index 818b882a402ac..bb121416ddec3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll @@ -249,42 +249,32 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v24, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v24, v16, v0 +; CHECK-NEXT: vfdiv.vv v24, v24, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v0, v8 +; CHECK-NEXT: vfdiv.vv v16, v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -617,42 +607,32 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v24, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v24, v16, v0 +; ZVFHMIN-NEXT: vfdiv.vv v24, v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v8 +; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index 0a11501905b81..728fa07a7d4e5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -820,6 +820,7 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv1r.v v3, v0 +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a2, fa0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: slli a1, a3, 1 @@ -835,41 +836,33 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t ; CHECK-NEXT: addi a4, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 -; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; CHECK-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t ; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vfmadd.vv v24, v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v4, v8, v0.t +; CHECK-NEXT: vfncvtbf16.f.f.w v4, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 @@ -879,27 +872,27 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload @@ -1061,81 +1054,87 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a2, fa0 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vmset.m v8 ; CHECK-NEXT: slli a1, a3, 1 ; CHECK-NEXT: srli a3, a3, 2 ; CHECK-NEXT: sub a4, a0, a1 ; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v24, a3 +; CHECK-NEXT: vslidedown.vx v0, v8, a3 ; CHECK-NEXT: sltu a3, a0, a4 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: slli a4, a4, 5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t -; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t ; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a4, a4, 3 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t ; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vfmadd.vv v24, v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8, v0.t +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -1143,17 +1142,25 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v24, v8 +; CHECK-NEXT: vfmadd.vv v24, v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v16, v0 -; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -2287,6 +2294,7 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 +; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a1, a3, 1 @@ -2302,41 +2310,33 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB68_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -2346,27 +2346,27 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload @@ -2540,81 +2540,87 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vmset.m v8 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: addi a4, sp, 16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB70_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB70_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -2622,17 +2628,25 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8 +; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -8266,7 +8280,8 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v24, v16, a1, v0.t +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 @@ -8277,45 +8292,37 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v16, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB282_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -8323,24 +8330,22 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: vmv1r.v v0, v3 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -8348,12 +8353,20 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload @@ -8524,13 +8537,17 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vmset.m v8 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 @@ -8538,88 +8555,98 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB284_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB284_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8 +; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9534,100 +9561,116 @@ define @vfnmadd_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vmset.m v7 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 +; ZVFHMIN-NEXT: vxor.vx v24, v8, a1 +; ZVFHMIN-NEXT: vxor.vx v8, v16, a1 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB292_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 +; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9673,44 +9716,39 @@ define @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 +; ZVFHMIN-NEXT: vmv8r.v v24, v16 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v8, a1, v0.t ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 @@ -10804,43 +10846,35 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB302_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -10850,40 +10884,40 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t ; ZVFHMIN-NEXT: vmv.v.v v16, v8 ; ZVFHMIN-NEXT: vmv4r.v v12, v4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -10919,10 +10953,18 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v8, a1, v0.t ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 @@ -10935,41 +10977,33 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB303_2 @@ -10987,7 +11021,10 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload @@ -10998,10 +11035,7 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload @@ -11044,9 +11078,12 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma @@ -11064,81 +11101,92 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a2 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB304_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB304_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -11182,42 +11230,37 @@ define @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 1 @@ -11298,10 +11349,10 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t ; ZVFHMIN-NEXT: sub a2, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a3 @@ -11315,12 +11366,11 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: vmv8r.v v8, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill @@ -11563,21 +11613,22 @@ define @vfnmsub_vf_nxv32f16_neg_splat_unmasked( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfmadd_vf_nxv32bf16( %va, < ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v24, a0 +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v16, v0 +; CHECK-NEXT: vfmadd.vv v24, v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v16, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -759,65 +771,77 @@ define @vfmadd_vf_nxv32f16( %va, @vfmadd_vf_nxv32bf16( %va, < ; ZVFH-NEXT: addi sp, sp, -16 ; ZVFH-NEXT: .cfi_def_cfa_offset 16 ; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 2 -; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: slli a0, a0, 5 ; ZVFH-NEXT: sub sp, sp, a0 -; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vmv8r.v v24, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vmv8r.v v16, v8 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; ZVFH-NEXT: fmv.x.h a0, fa0 -; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16 ; ZVFH-NEXT: csrr a1, vlenb -; ZVFH-NEXT: slli a1, a1, 4 +; ZVFH-NEXT: slli a1, a1, 3 +; ZVFH-NEXT: mv a2, a1 +; ZVFH-NEXT: slli a1, a1, 1 +; ZVFH-NEXT: add a1, a1, a2 ; ZVFH-NEXT: add a1, sp, a1 ; ZVFH-NEXT: addi a1, a1, 16 -; ZVFH-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16 +; ZVFH-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24 ; ZVFH-NEXT: csrr a1, vlenb ; ZVFH-NEXT: slli a1, a1, 3 ; ZVFH-NEXT: add a1, sp, a1 ; ZVFH-NEXT: addi a1, a1, 16 -; ZVFH-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFH-NEXT: vmv.v.x v24, a0 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 1 -; ZVFH-NEXT: add a0, a0, a1 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vmv.v.x v8, a0 +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: mv a1, a0 @@ -433,40 +434,17 @@ define @vfmadd_vf_nxv32bf16( %va, < ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v0 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 5 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 5 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmadd.vv v16, v24, v0 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 5 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12 +; ZVFH-NEXT: vfmadd.vv v24, v16, v0 ; ZVFH-NEXT: addi a0, sp, 16 ; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v20 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v20 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: mv a1, a0 @@ -474,23 +452,29 @@ define @vfmadd_vf_nxv32bf16( %va, < ; ZVFH-NEXT: add a0, a0, a1 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v4 -; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmadd.vv v16, v8, v24 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v12 ; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 5 +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmadd.vv v0, v16, v8 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24 -; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v0 ; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 2 -; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: slli a0, a0, 5 ; ZVFH-NEXT: add sp, sp, a0 ; ZVFH-NEXT: .cfi_def_cfa sp, 16 ; ZVFH-NEXT: addi sp, sp, 16 @@ -502,94 +486,80 @@ define @vfmadd_vf_nxv32bf16( %va, < ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: li a2, 24 +; ZVFHMIN-NEXT: mul a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v8, a0 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: li a1, 24 ; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v20 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: li a1, 24 ; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v4 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v12 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -973,94 +943,80 @@ define @vfmadd_vf_nxv32f16( %va, @vfmsub_vf_nxv32f16( %va,