From 7be65055b22f89b2dc05883bee571a6c177dca17 Mon Sep 17 00:00:00 2001 From: Yuanke Luo Date: Sun, 16 Nov 2025 16:29:35 +0800 Subject: [PATCH 1/4] [RegAlloc] Relax the split constraint on MBB prolog https://reviews.llvm.org/D52052 prevents register splitting in an MBB that has prolog instructions defining the exec register (or a mask register that activates the threads of a warp on a GPU). The constraint seems too strict, because 1) If the split is allowed, it may fit the free live range of a physical register, and no spill will happen; 2) The register class of the register being split may not be the same as that of the register defined in the prolog, so there is no interference with the register being defined in the prolog. The current code has another small issue. MBB->getFirstNonDebugInstr() only skips debug instructions, but SA->getFirstSplitPoint(Number) would also skip label and phi instructions. This causes some MBBs with a label instruction to be treated as having a prolog. This patch relaxes the split constraint on MBBs with a prolog by checking whether the register defined in the prolog has a common register class with the register being split. It allows the split if the register defined in the prolog is a physical register or there is no common register class. 
--- llvm/lib/CodeGen/RegAllocGreedy.cpp | 9 +- llvm/lib/CodeGen/SplitKit.cpp | 28 + llvm/lib/CodeGen/SplitKit.h | 8 + .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 5369 +++++++++-------- .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 187 +- .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 293 +- .../ran-out-of-sgprs-allocation-failure.mir | 62 +- .../test/CodeGen/AMDGPU/spill-before-exec.mir | 2 +- .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 537 +- .../CodeGen/X86/2008-04-17-CoalescerBug.ll | 78 +- 10 files changed, 3328 insertions(+), 3245 deletions(-) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 1bc7607890328..a6a65b444b466 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -773,8 +773,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf, // Abort if the spill cannot be inserted at the MBB' start if (((BC.Entry == SpillPlacement::MustSpill) || (BC.Entry == SpillPlacement::PrefSpill)) && - SlotIndex::isEarlierInstr(BI.FirstInstr, - SA->getFirstSplitPoint(BC.Number))) + !SA->canSplitBeforeProlog(BC.Number)) return false; } @@ -829,11 +828,7 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf, BCS[B].Number = Number; // Abort if the spill cannot be inserted at the MBB' start - MachineBasicBlock *MBB = MF->getBlockNumbered(Number); - auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr(); - if (FirstNonDebugInstr != MBB->end() && - SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr), - SA->getFirstSplitPoint(Number))) + if (!SA->canSplitBeforeProlog(Number)) return false; // Interference for the live-in value. 
if (Intf.first() <= Indexes->getMBBStartIdx(Number)) diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 8ec4bfbb5a330..f87c5f43ccaaa 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -147,6 +147,34 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI, return LIS.getInstructionFromIndex(LIP); } +bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI, + const MachineBasicBlock &MBB) { + const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); + + for (auto &MI : MBB) { + if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() || + MI.isPseudoProbe()) + continue; + + if (!TII->isBasicBlockPrologue(MI)) + return true; + + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual()) + continue; + + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg()); + const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg()); + if (TRI->getCommonSubClass(RC, CurRC)) + return false; + } + } + + return true; +} + //===----------------------------------------------------------------------===// // Split Analysis //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h index de255911268f2..a9fc921534d0e 100644 --- a/llvm/lib/CodeGen/SplitKit.h +++ b/llvm/lib/CodeGen/SplitKit.h @@ -89,6 +89,9 @@ class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis { return Res; } + /// Return true if we can split \p CurLI before \p MBB's prolog. 
+ bool canSplitBeforeProlog(const LiveInterval &CurLI, + const MachineBasicBlock &MBB); }; /// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting @@ -247,6 +250,11 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis { SlotIndex getFirstSplitPoint(unsigned Num) { return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num)); } + + bool canSplitBeforeProlog(unsigned Num) { + MachineBasicBlock *BB = MF.getBlockNumbered(Num); + return IPA.canSplitBeforeProlog(*CurLI, *BB); + } }; /// SplitEditor - Edit machine code and LiveIntervals for live range diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index d3ebd92f0677b..5ff30224f87ea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -154717,13 +154717,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s73, s21 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v44, s19, 0 ; SI-NEXT: v_writelane_b32 v44, s18, 1 ; SI-NEXT: v_writelane_b32 v44, s17, 2 ; SI-NEXT: v_writelane_b32 v44, s16, 3 -; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 ; SI-NEXT: v_writelane_b32 v41, s35, 3 @@ -154747,9 +154747,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s74, s29 -; SI-NEXT: s_mov_b32 s78, s28 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: s_mov_b32 s57, s28 +; SI-NEXT: s_mov_b32 s47, s27 ; SI-NEXT: v_writelane_b32 v41, s80, 24 
; SI-NEXT: v_writelane_b32 v41, s81, 25 ; SI-NEXT: v_writelane_b32 v41, s82, 26 @@ -154759,7 +154758,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: s_mov_b32 s47, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 @@ -154769,95 +154767,101 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: v_writelane_b32 v42, s89, 0 +; SI-NEXT: v_readfirstlane_b32 s91, v10 +; SI-NEXT: v_writelane_b32 v42, s90, 1 +; SI-NEXT: v_readfirstlane_b32 s92, v8 +; SI-NEXT: v_writelane_b32 v42, s91, 2 +; SI-NEXT: v_readfirstlane_b32 s93, v7 +; SI-NEXT: v_writelane_b32 v42, s92, 3 +; SI-NEXT: v_readfirstlane_b32 s94, v13 +; SI-NEXT: v_writelane_b32 v42, s93, 4 +; SI-NEXT: v_readfirstlane_b32 s95, v14 +; SI-NEXT: v_writelane_b32 v42, s94, 5 +; SI-NEXT: v_writelane_b32 v42, s95, 6 +; SI-NEXT: v_readfirstlane_b32 s30, v17 +; SI-NEXT: v_readfirstlane_b32 s31, v18 +; SI-NEXT: v_readfirstlane_b32 s34, v16 +; SI-NEXT: v_readfirstlane_b32 s35, v15 +; SI-NEXT: v_readfirstlane_b32 s36, v21 ; SI-NEXT: v_readfirstlane_b32 s37, v22 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: v_writelane_b32 v43, s37, 0 ; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v43, s38, 1 ; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v43, s39, 2 ; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v43, s48, 3 ; SI-NEXT: 
v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v43, s49, 4 ; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v43, s50, 5 ; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v43, s51, 6 ; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v43, s52, 7 -; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v43, s53, 8 -; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v43, s54, 9 -; SI-NEXT: v_writelane_b32 v43, s55, 10 -; SI-NEXT: s_mov_b32 s57, s24 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s6, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: v_writelane_b32 v44, s4, 4 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v44, s4, 4 +; SI-NEXT: v_writelane_b32 v44, s4, 5 ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v44, s4, 5 -; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v44, s4, 6 -; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v44, s4, 7 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v44, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v44, s4, 8 +; SI-NEXT: v_writelane_b32 v44, s4, 9 ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, 
s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v44, s4, 9 +; SI-NEXT: v_writelane_b32 v44, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v44, s4, 10 +; SI-NEXT: v_writelane_b32 v44, s4, 11 +; SI-NEXT: v_readfirstlane_b32 s54, v28 +; SI-NEXT: v_readfirstlane_b32 s55, v27 +; SI-NEXT: s_mov_b32 s6, s23 +; SI-NEXT: s_mov_b32 s23, s21 +; SI-NEXT: s_mov_b32 s58, s26 +; SI-NEXT: s_mov_b32 s40, s25 +; SI-NEXT: s_mov_b32 s25, s24 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 ; SI-NEXT: v_readfirstlane_b32 s77, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: v_readfirstlane_b32 s91, v10 -; SI-NEXT: v_readfirstlane_b32 s92, v8 -; SI-NEXT: v_readfirstlane_b32 s93, v7 -; SI-NEXT: v_readfirstlane_b32 s94, v13 -; SI-NEXT: v_readfirstlane_b32 s95, v14 -; SI-NEXT: v_readfirstlane_b32 s30, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v18 -; SI-NEXT: v_readfirstlane_b32 s34, v16 -; SI-NEXT: v_readfirstlane_b32 s35, v15 -; SI-NEXT: v_readfirstlane_b32 s36, v21 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s26, v53 +; SI-NEXT: v_readfirstlane_b32 s46, v54 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s61, v55 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s24, v40 +; SI-NEXT: v_readfirstlane_b32 s62, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v44, s4, 11 +; SI-NEXT: v_writelane_b32 v44, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v44, s4, 12 +; SI-NEXT: v_writelane_b32 v44, s4, 13 ; SI-NEXT: 
s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v44, s4, 13 +; SI-NEXT: v_writelane_b32 v44, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v44, s4, 14 +; SI-NEXT: v_writelane_b32 v44, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v44, s4, 15 +; SI-NEXT: v_writelane_b32 v44, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -154867,40 +154871,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 +; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s75, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v44, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s21, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v44, s4, 16 +; SI-NEXT: v_writelane_b32 v44, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v44, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s40, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v44, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s61, v36 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v44, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s63, v37 +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v44, s4, 23 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; 
SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s59, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v44, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s56, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v44, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s43, v39 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v44, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v44, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s42, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v44, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s13, v50 +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v44, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 @@ -154908,45 +154923,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s88, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s79, v33 +; 
SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: v_writelane_b32 v44, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v44, s4, 18 +; SI-NEXT: v_writelane_b32 v44, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v44, s4, 19 +; SI-NEXT: v_writelane_b32 v44, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v44, s4, 20 +; SI-NEXT: v_writelane_b32 v44, s4, 35 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v44, s4, 21 +; SI-NEXT: v_readfirstlane_b32 s43, v37 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v44, s4, 22 +; SI-NEXT: v_writelane_b32 v44, s4, 36 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v44, s4, 23 +; SI-NEXT: v_writelane_b32 v44, s4, 37 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v44, s4, 24 +; SI-NEXT: v_writelane_b32 v44, s4, 38 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v44, s4, 25 +; SI-NEXT: v_writelane_b32 v44, s4, 39 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v44, s4, 26 +; SI-NEXT: v_writelane_b32 v44, s4, 40 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v44, s4, 27 +; SI-NEXT: v_writelane_b32 v44, s4, 41 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v44, s4, 28 +; SI-NEXT: v_writelane_b32 v44, s4, 42 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(3) 
@@ -154962,41 +154979,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v44, s4, 29 +; SI-NEXT: v_writelane_b32 v44, s4, 43 +; SI-NEXT: v_writelane_b32 v44, s22, 44 +; SI-NEXT: v_writelane_b32 v44, s6, 45 +; SI-NEXT: v_writelane_b32 v44, s23, 46 +; SI-NEXT: v_writelane_b32 v44, s20, 47 +; SI-NEXT: v_writelane_b32 v44, s58, 48 +; SI-NEXT: v_writelane_b32 v44, s47, 49 +; SI-NEXT: v_writelane_b32 v44, s40, 50 +; SI-NEXT: v_writelane_b32 v44, s25, 51 +; SI-NEXT: v_writelane_b32 v44, s29, 52 +; SI-NEXT: v_writelane_b32 v44, s57, 53 +; SI-NEXT: v_writelane_b32 v44, s62, 54 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v44, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v44, s4, 31 -; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v44, s4, 32 -; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v44, s4, 33 -; SI-NEXT: v_writelane_b32 v44, s22, 34 -; SI-NEXT: v_writelane_b32 v44, s23, 35 -; SI-NEXT: v_writelane_b32 v44, s73, 36 -; SI-NEXT: v_writelane_b32 v44, s20, 37 -; SI-NEXT: v_writelane_b32 v44, s47, 38 -; SI-NEXT: v_writelane_b32 v44, s76, 39 -; SI-NEXT: v_writelane_b32 v44, s25, 40 -; SI-NEXT: v_writelane_b32 v44, s57, 41 -; SI-NEXT: v_writelane_b32 v44, s74, 42 -; SI-NEXT: v_writelane_b32 v44, s78, 43 -; SI-NEXT: v_writelane_b32 v44, s24, 44 -; SI-NEXT: v_writelane_b32 v44, s16, 45 -; SI-NEXT: v_writelane_b32 v44, s17, 46 -; SI-NEXT: v_writelane_b32 v44, s18, 47 -; SI-NEXT: v_writelane_b32 v44, s19, 48 -; SI-NEXT: v_writelane_b32 v44, s77, 49 -; SI-NEXT: v_writelane_b32 v44, s89, 50 -; SI-NEXT: v_writelane_b32 v44, s90, 51 -; SI-NEXT: v_writelane_b32 v44, s91, 52 -; SI-NEXT: v_writelane_b32 v44, s92, 53 -; 
SI-NEXT: v_writelane_b32 v44, s93, 54 -; SI-NEXT: v_writelane_b32 v44, s94, 55 -; SI-NEXT: v_writelane_b32 v44, s95, 56 +; SI-NEXT: v_readfirstlane_b32 s21, v52 +; SI-NEXT: v_writelane_b32 v44, s61, 55 +; SI-NEXT: v_writelane_b32 v44, s21, 56 +; SI-NEXT: v_writelane_b32 v44, s26, 57 +; SI-NEXT: v_writelane_b32 v44, s46, 58 +; SI-NEXT: v_writelane_b32 v44, s16, 59 +; SI-NEXT: v_writelane_b32 v44, s17, 60 +; SI-NEXT: v_writelane_b32 v44, s18, 61 +; SI-NEXT: v_writelane_b32 v44, s19, 62 +; SI-NEXT: v_writelane_b32 v44, s77, 63 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: v_readfirstlane_b32 s13, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s10, v34 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -155004,7 +155011,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: v_readfirstlane_b32 s27, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s29, v36 +; SI-NEXT: v_readfirstlane_b32 s56, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s69, v37 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -155035,17 +155042,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 ; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57 -; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58 -; SI-NEXT: v_writelane_b32 v44, s30, 59 -; SI-NEXT: v_writelane_b32 v44, s31, 60 -; SI-NEXT: v_writelane_b32 v44, s34, 61 -; SI-NEXT: v_writelane_b32 v44, s35, 62 -; SI-NEXT: v_writelane_b32 v44, s36, 63 +; SI-NEXT: v_writelane_b32 v42, vcc_lo, 7 +; SI-NEXT: v_writelane_b32 v42, vcc_hi, 8 +; SI-NEXT: v_writelane_b32 v42, s30, 9 +; SI-NEXT: v_writelane_b32 v42, s31, 10 +; SI-NEXT: v_writelane_b32 v42, s34, 11 +; SI-NEXT: v_writelane_b32 v42, s35, 12 +; SI-NEXT: v_writelane_b32 v42, s36, 13 +; SI-NEXT: v_writelane_b32 v42, s37, 14 
+; SI-NEXT: v_writelane_b32 v42, s38, 15 +; SI-NEXT: v_writelane_b32 v42, s39, 16 +; SI-NEXT: v_writelane_b32 v42, s48, 17 +; SI-NEXT: v_writelane_b32 v42, s49, 18 +; SI-NEXT: v_writelane_b32 v42, s50, 19 +; SI-NEXT: v_writelane_b32 v42, s51, 20 +; SI-NEXT: v_writelane_b32 v42, s52, 21 +; SI-NEXT: v_writelane_b32 v42, s53, 22 +; SI-NEXT: v_writelane_b32 v42, s54, 23 +; SI-NEXT: v_writelane_b32 v42, s55, 24 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s60, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s62, v32 +; SI-NEXT: v_readfirstlane_b32 s63, v32 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -155069,7 +155087,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s26, v48 +; SI-NEXT: v_readfirstlane_b32 s75, v48 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s15, v49 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -155103,48 +155121,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s65, v48 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v43, s64, 11 +; SI-NEXT: v_writelane_b32 v42, s64, 25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v43, s65, 12 +; SI-NEXT: v_writelane_b32 v42, s65, 26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v43, s67, 13 -; SI-NEXT: v_writelane_b32 v43, s84, 14 -; SI-NEXT: v_writelane_b32 v43, s85, 15 -; SI-NEXT: v_writelane_b32 v43, s86, 16 -; SI-NEXT: v_writelane_b32 v43, s87, 17 -; SI-NEXT: v_writelane_b32 v43, s8, 18 -; SI-NEXT: v_writelane_b32 v43, s99, 19 -; 
SI-NEXT: v_writelane_b32 v43, s12, 20 -; SI-NEXT: v_writelane_b32 v43, s44, 21 -; SI-NEXT: v_writelane_b32 v43, s97, 22 -; SI-NEXT: v_writelane_b32 v43, s15, 23 -; SI-NEXT: v_writelane_b32 v43, s96, 24 -; SI-NEXT: v_writelane_b32 v43, s98, 25 -; SI-NEXT: v_writelane_b32 v43, s83, 26 -; SI-NEXT: v_writelane_b32 v43, s82, 27 -; SI-NEXT: v_writelane_b32 v43, s9, 28 -; SI-NEXT: v_writelane_b32 v43, s81, 29 -; SI-NEXT: v_writelane_b32 v43, s80, 30 -; SI-NEXT: v_writelane_b32 v43, s7, 31 -; SI-NEXT: v_writelane_b32 v43, s72, 32 -; SI-NEXT: v_writelane_b32 v43, s26, 33 -; SI-NEXT: v_writelane_b32 v43, s41, 34 -; SI-NEXT: v_writelane_b32 v43, s14, 35 -; SI-NEXT: v_writelane_b32 v43, s69, 36 -; SI-NEXT: v_writelane_b32 v43, s71, 37 -; SI-NEXT: v_writelane_b32 v43, s70, 38 -; SI-NEXT: v_writelane_b32 v43, s68, 39 -; SI-NEXT: v_writelane_b32 v43, s60, 40 -; SI-NEXT: v_writelane_b32 v43, s62, 41 -; SI-NEXT: v_writelane_b32 v43, s11, 42 -; SI-NEXT: v_writelane_b32 v43, s10, 43 -; SI-NEXT: v_writelane_b32 v43, s58, 44 -; SI-NEXT: v_writelane_b32 v43, s66, 45 -; SI-NEXT: v_writelane_b32 v43, s29, 46 -; SI-NEXT: v_writelane_b32 v43, s28, 47 -; SI-NEXT: v_writelane_b32 v43, s27, 48 +; SI-NEXT: v_writelane_b32 v42, s67, 27 +; SI-NEXT: v_writelane_b32 v42, s84, 28 +; SI-NEXT: v_writelane_b32 v42, s85, 29 +; SI-NEXT: v_writelane_b32 v42, s86, 30 +; SI-NEXT: v_writelane_b32 v42, s87, 31 +; SI-NEXT: v_writelane_b32 v42, s8, 32 +; SI-NEXT: v_writelane_b32 v42, s99, 33 +; SI-NEXT: v_writelane_b32 v42, s12, 34 +; SI-NEXT: v_writelane_b32 v42, s44, 35 +; SI-NEXT: v_writelane_b32 v42, s97, 36 +; SI-NEXT: v_writelane_b32 v42, s15, 37 +; SI-NEXT: v_writelane_b32 v42, s96, 38 +; SI-NEXT: v_writelane_b32 v42, s98, 39 +; SI-NEXT: v_writelane_b32 v42, s83, 40 +; SI-NEXT: v_writelane_b32 v42, s82, 41 +; SI-NEXT: v_writelane_b32 v42, s9, 42 +; SI-NEXT: v_writelane_b32 v42, s81, 43 +; SI-NEXT: v_writelane_b32 v42, s80, 44 +; SI-NEXT: v_writelane_b32 v42, s7, 45 +; SI-NEXT: v_writelane_b32 v42, s72, 
46 +; SI-NEXT: v_writelane_b32 v42, s75, 47 +; SI-NEXT: v_writelane_b32 v42, s41, 48 +; SI-NEXT: v_writelane_b32 v42, s14, 49 +; SI-NEXT: v_writelane_b32 v42, s69, 50 +; SI-NEXT: v_writelane_b32 v42, s71, 51 +; SI-NEXT: v_writelane_b32 v42, s70, 52 +; SI-NEXT: v_writelane_b32 v42, s68, 53 +; SI-NEXT: v_writelane_b32 v42, s59, 54 +; SI-NEXT: v_writelane_b32 v42, s63, 55 +; SI-NEXT: v_writelane_b32 v42, s11, 56 +; SI-NEXT: v_writelane_b32 v42, s10, 57 +; SI-NEXT: v_writelane_b32 v42, s13, 58 +; SI-NEXT: v_writelane_b32 v42, s66, 59 +; SI-NEXT: v_writelane_b32 v42, s56, 60 +; SI-NEXT: v_writelane_b32 v42, s28, 61 +; SI-NEXT: v_writelane_b32 v42, s27, 62 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_readlane_b32 s4, v44, 3 @@ -155153,107 +155171,97 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v43, s4, 58 +; SI-NEXT: v_writelane_b32 v43, s4, 10 ; SI-NEXT: v_readlane_b32 s4, v44, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: v_readlane_b32 s5, v44, 0 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v43, s4, 59 +; SI-NEXT: s_or_b32 s45, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s24, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_mov_b32 s22, s6 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 60 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s42, s6, s5 +; SI-NEXT: s_and_b32 s5, s25, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 61 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: 
s_and_b32 s5, s47, 0xff +; SI-NEXT: s_lshl_b32 s6, s40, 24 +; SI-NEXT: s_or_b32 s40, s6, s5 +; SI-NEXT: s_and_b32 s5, s58, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 62 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s78, 0xff -; SI-NEXT: s_lshl_b32 s6, s74, 8 +; SI-NEXT: s_lshl_b32 s6, s47, 24 +; SI-NEXT: s_or_b32 s73, s6, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 63 -; SI-NEXT: s_or_b32 s4, s16, s6 +; SI-NEXT: s_or_b32 s25, s16, s6 ; SI-NEXT: s_and_b32 s6, s89, 0xff -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s77, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 0 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 1 +; SI-NEXT: s_or_b32 s4, s16, s6 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s76, s16, s6 +; SI-NEXT: s_or_b32 s23, s16, s6 ; SI-NEXT: s_and_b32 s6, s93, 0xff ; SI-NEXT: s_lshl_b32 s16, s92, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s16, s90, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s77, s17, s16 +; SI-NEXT: s_or_b32 s76, s17, s16 ; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s25, s17, s16 +; SI-NEXT: s_or_b32 s22, s17, s16 ; SI-NEXT: s_and_b32 s16, s94, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s74, s17, s16 +; SI-NEXT: s_or_b32 s88, s17, s16 ; SI-NEXT: s_and_b32 s16, s35, 0xff ; SI-NEXT: s_lshl_b32 s17, s34, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s30, 0xff ; SI-NEXT: s_lshl_b32 s17, 
s17, 16 ; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s78, s18, s17 +; SI-NEXT: s_or_b32 s74, s18, s17 ; SI-NEXT: s_and_b32 s17, s39, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_mov_b32 s31, s88 -; SI-NEXT: s_or_b32 s88, s18, s17 +; SI-NEXT: s_or_b32 s77, s18, s17 ; SI-NEXT: s_and_b32 s17, s36, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s89, s18, s17 +; SI-NEXT: s_or_b32 s79, s18, s17 ; SI-NEXT: s_and_b32 s17, s51, 0xff ; SI-NEXT: s_lshl_b32 s18, s50, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s48, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_writelane_b32 v43, s18, 49 +; SI-NEXT: s_or_b32 s89, s19, s18 ; SI-NEXT: s_and_b32 s18, s55, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_mov_b32 s73, s79 -; SI-NEXT: s_or_b32 s79, s19, s18 +; SI-NEXT: s_or_b32 s78, s19, s18 ; SI-NEXT: s_and_b32 s18, s52, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s53, 24 -; SI-NEXT: s_or_b32 s94, s19, s18 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v42, s18, 63 ; SI-NEXT: s_and_b32 s18, s84, 0xff ; SI-NEXT: s_lshl_b32 s19, s67, 8 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s19, s64, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s95, s20, s19 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s19, 0 ; SI-NEXT: s_and_b32 s19, s12, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s8, 24 @@ -155261,49 +155269,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s85, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s86, 24 -; SI-NEXT: s_or_b32 s12, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s8, 2 +; SI-NEXT: s_or_b32 s8, s20, s19 ; SI-NEXT: 
s_and_b32 s19, s80, 0xff ; SI-NEXT: s_lshl_b32 s20, s9, 8 ; SI-NEXT: s_or_b32 vcc_lo, s19, s20 ; SI-NEXT: s_and_b32 s19, s44, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s97, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s8, 1 +; SI-NEXT: s_or_b32 s8, s20, s19 ; SI-NEXT: s_and_b32 s19, s41, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s96, 0xff +; SI-NEXT: v_writelane_b32 v43, s8, 3 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s15, 24 -; SI-NEXT: v_writelane_b32 v43, s12, 50 -; SI-NEXT: s_or_b32 s12, s20, s19 -; SI-NEXT: s_and_b32 s19, s26, 0xff +; SI-NEXT: v_writelane_b32 v43, s7, 5 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s75, 0xff ; SI-NEXT: s_lshl_b32 s20, s82, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 ; SI-NEXT: s_and_b32 s19, s99, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 51 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s7, 4 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s72, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s81, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 52 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s7, 6 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s98, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 54 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s62, 0xff -; SI-NEXT: s_lshl_b32 s20, s60, 8 +; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_and_b32 s19, s63, 0xff +; SI-NEXT: s_lshl_b32 s20, s59, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 ; SI-NEXT: s_and_b32 s19, s71, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 53 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: v_writelane_b32 v43, 
s7, 7 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s68, 24 @@ -155311,185 +155321,185 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 55 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s29, 0xff +; SI-NEXT: s_or_b32 s58, s20, s19 +; SI-NEXT: s_and_b32 s19, s56, 0xff ; SI-NEXT: s_lshl_b32 s20, s66, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 ; SI-NEXT: s_and_b32 s19, s10, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s58, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 56 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s13, 24 +; SI-NEXT: v_writelane_b32 v43, s7, 8 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s27, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 57 -; SI-NEXT: s_or_b32 s23, s20, s19 -; SI-NEXT: s_and_b32 s19, s24, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 33 +; SI-NEXT: s_or_b32 s47, s20, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v44, 32 -; SI-NEXT: s_or_b32 s10, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 31 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v44, 30 +; SI-NEXT: s_lshl_b32 s20, s61, 24 +; SI-NEXT: s_or_b32 s56, s20, s19 +; SI-NEXT: s_and_b32 s19, s46, 0xff +; SI-NEXT: s_lshl_b32 s20, s26, 8 +; SI-NEXT: v_writelane_b32 v43, s7, 9 ; SI-NEXT: s_or_b32 s86, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 29 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 43 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v44, 
28 -; SI-NEXT: s_or_b32 s47, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 27 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 42 +; SI-NEXT: s_or_b32 s61, s20, s19 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 41 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 26 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 25 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 40 +; SI-NEXT: s_or_b32 s46, s20, s19 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 39 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 24 -; SI-NEXT: s_or_b32 s24, s20, s19 -; SI-NEXT: s_mov_b32 s92, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 23 -; SI-NEXT: s_mov_b32 s36, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 8 -; SI-NEXT: v_readlane_b32 s11, v44, 22 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 38 +; SI-NEXT: s_or_b32 s62, s20, s19 +; SI-NEXT: s_mov_b32 s92, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 37 +; SI-NEXT: s_mov_b32 s37, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 36 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_mov_b32 s62, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 21 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: s_mov_b32 s30, s7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s30, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 20 -; SI-NEXT: s_or_b32 s58, s20, s19 -; SI-NEXT: s_mov_b32 s91, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 19 +; SI-NEXT: s_lshl_b32 s20, s43, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 35 +; SI-NEXT: s_or_b32 s64, s20, s19 +; SI-NEXT: 
s_mov_b32 s91, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 34 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s35, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 18 -; SI-NEXT: s_mov_b32 s4, s46 -; SI-NEXT: s_or_b32 s46, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: s_mov_b32 s36, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 33 +; SI-NEXT: s_mov_b32 s39, s43 +; SI-NEXT: s_or_b32 s43, s20, s19 +; SI-NEXT: s_mov_b32 s53, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 32 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 -; SI-NEXT: s_mov_b32 s52, s73 -; SI-NEXT: s_or_b32 s73, s20, s19 -; SI-NEXT: s_and_b32 s19, s31, 0xff -; SI-NEXT: s_lshl_b32 s20, s45, 8 +; SI-NEXT: s_mov_b32 s49, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 31 +; SI-NEXT: s_or_b32 s65, s20, s19 +; SI-NEXT: s_mov_b32 s90, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 30 +; SI-NEXT: s_mov_b32 s54, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 29 ; SI-NEXT: s_or_b32 s26, s19, s20 -; SI-NEXT: s_and_b32 s19, s13, 0xff +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 28 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s42, 24 +; SI-NEXT: s_mov_b32 s50, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 27 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 26 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_mov_b32 s53, s42 -; SI-NEXT: s_or_b32 s42, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: s_mov_b32 s38, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 25 +; SI-NEXT: s_or_b32 s66, 
s20, s19 +; SI-NEXT: s_mov_b32 s48, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s59, 24 +; SI-NEXT: s_mov_b32 s59, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 23 ; SI-NEXT: s_or_b32 s68, s20, s19 -; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s61, 8 -; SI-NEXT: v_readlane_b32 s93, v44, 17 +; SI-NEXT: s_mov_b32 s63, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 22 +; SI-NEXT: s_mov_b32 s52, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 21 ; SI-NEXT: s_or_b32 s27, s19, s20 -; SI-NEXT: s_and_b32 s19, s40, 0xff +; SI-NEXT: s_mov_b32 s51, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 20 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s93, 24 +; SI-NEXT: s_mov_b32 s55, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 19 ; SI-NEXT: s_or_b32 s70, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_mov_b32 s51, s59 -; SI-NEXT: s_mov_b32 s59, s7 +; SI-NEXT: s_mov_b32 s93, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 18 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 16 -; SI-NEXT: s_mov_b32 s48, s56 -; SI-NEXT: s_mov_b32 s56, s10 +; SI-NEXT: s_mov_b32 s75, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 17 ; SI-NEXT: s_or_b32 s69, s20, s19 ; SI-NEXT: s_mov_b32 s10, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 15 +; SI-NEXT: v_readlane_b32 s7, v44, 16 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 14 -; SI-NEXT: s_mov_b32 s39, s75 -; SI-NEXT: s_mov_b32 s75, s94 +; SI-NEXT: v_readlane_b32 s7, v44, 15 ; SI-NEXT: s_or_b32 s94, s20, s19 ; SI-NEXT: s_mov_b32 s41, s7 ; SI-NEXT: 
s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: v_readlane_b32 s7, v44, 14 ; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 13 ; SI-NEXT: s_or_b32 s29, s19, s20 ; SI-NEXT: s_mov_b32 s81, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 11 -; SI-NEXT: s_mov_b32 s55, s45 -; SI-NEXT: s_mov_b32 s45, s9 +; SI-NEXT: v_readlane_b32 s7, v44, 12 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 10 -; SI-NEXT: s_mov_b32 s38, s11 +; SI-NEXT: v_readlane_b32 s7, v44, 11 ; SI-NEXT: s_or_b32 s11, s20, s19 ; SI-NEXT: s_mov_b32 s72, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 9 +; SI-NEXT: v_readlane_b32 s7, v44, 10 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s82, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 9 ; SI-NEXT: s_or_b32 s80, s20, s19 ; SI-NEXT: s_mov_b32 s83, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 7 +; SI-NEXT: v_readlane_b32 s7, v44, 8 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s96, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 6 -; SI-NEXT: s_mov_b32 s90, s31 +; SI-NEXT: v_readlane_b32 s7, v44, 7 ; SI-NEXT: s_or_b32 s31, s20, s19 ; SI-NEXT: s_mov_b32 s98, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 5 +; SI-NEXT: v_readlane_b32 s7, v44, 6 ; SI-NEXT: s_mov_b32 s44, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 4 -; SI-NEXT: s_mov_b32 s37, s43 -; SI-NEXT: s_mov_b32 s43, s93 -; SI-NEXT: s_mov_b32 s93, s21 +; SI-NEXT: v_readlane_b32 s7, v44, 5 ; SI-NEXT: s_or_b32 s21, s19, s20 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s34, s4 +; SI-NEXT: v_readlane_b32 s8, v44, 4 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; 
SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v43, 60 -; SI-NEXT: s_mov_b32 s54, s13 -; SI-NEXT: s_mov_b32 s13, s12 -; SI-NEXT: s_mov_b32 s50, s63 -; SI-NEXT: s_mov_b32 s63, s95 -; SI-NEXT: s_mov_b32 s49, s61 -; SI-NEXT: s_mov_b32 s61, s8 -; SI-NEXT: s_mov_b32 s60, s40 +; SI-NEXT: s_lshl_b32 s20, s8, 24 ; SI-NEXT: s_mov_b32 s12, s7 -; SI-NEXT: s_mov_b32 s7, s22 +; SI-NEXT: s_mov_b32 s7, s8 ; SI-NEXT: s_or_b32 s15, s20, s19 -; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s95, s5, 16 -; SI-NEXT: s_lshl_b32 s22, s6, 16 +; SI-NEXT: s_lshl_b32 s20, s24, 16 +; SI-NEXT: s_lshl_b32 s35, s5, 16 +; SI-NEXT: s_lshl_b32 s95, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 16 @@ -155500,16 +155510,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s97, s86, 16 ; SI-NEXT: s_lshl_b32 s28, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v43, 58 +; SI-NEXT: v_readlane_b32 s26, v43, 10 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v43, 59 -; SI-NEXT: v_readlane_b32 s66, v43, 63 +; SI-NEXT: s_mov_b32 s27, s45 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v43, 62 -; SI-NEXT: v_readlane_b32 s65, v43, 61 -; SI-NEXT: v_readlane_b32 s64, v42, 0 +; SI-NEXT: s_mov_b32 s29, s40 +; SI-NEXT: s_mov_b32 s24, s42 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: v_readlane_b32 s21, v42, 1 +; SI-NEXT: s_mov_b32 s21, s4 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s98, 3 @@ -155550,7 +155558,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_add_i32 s16, s93, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s17, s39, 8 +; SI-NEXT: s_lshl_b32 s17, s75, 8 ; SI-NEXT: s_add_i32 s18, s10, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 
0xff @@ -155560,13 +155568,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s50, 3 +; SI-NEXT: s_add_i32 s17, s63, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s49, 8 -; SI-NEXT: s_add_i32 s19, s60, 3 +; SI-NEXT: s_lshl_b32 s18, s52, 8 +; SI-NEXT: s_add_i32 s19, s51, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s43, 24 +; SI-NEXT: s_lshl_b32 s18, s55, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 @@ -155574,11 +155582,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_add_i32 s18, s34, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s37, 8 +; SI-NEXT: s_lshl_b32 s19, s38, 8 ; SI-NEXT: s_add_i32 s20, s48, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s51, 24 +; SI-NEXT: s_lshl_b32 s19, s59, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 @@ -155586,11 +155594,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_add_i32 s19, s90, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s55, 8 -; SI-NEXT: s_add_i32 s22, s54, 3 +; SI-NEXT: s_lshl_b32 s20, s54, 8 +; SI-NEXT: s_add_i32 s22, s13, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s53, 24 +; SI-NEXT: s_lshl_b32 s20, s50, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 @@ -155598,11 +155606,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s19, s20, 
s19 ; SI-NEXT: s_add_i32 s20, s91, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s35, 8 -; SI-NEXT: s_add_i32 s23, s38, 3 +; SI-NEXT: s_lshl_b32 s22, s36, 8 +; SI-NEXT: s_add_i32 s23, s53, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_and_b32 s23, s23, 0xff -; SI-NEXT: s_lshl_b32 s22, s52, 24 +; SI-NEXT: s_lshl_b32 s22, s49, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 @@ -155610,93 +155618,93 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s92, 3 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s36, 8 -; SI-NEXT: s_add_i32 s60, s62, 3 +; SI-NEXT: s_lshl_b32 s23, s37, 8 +; SI-NEXT: s_add_i32 s60, s30, 3 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_and_b32 s60, s60, 0xff -; SI-NEXT: s_lshl_b32 s23, s30, 24 +; SI-NEXT: s_lshl_b32 s23, s39, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v44, 28 +; SI-NEXT: v_readlane_b32 s7, v44, 42 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v44, 27 +; SI-NEXT: v_readlane_b32 s7, v44, 41 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 25 +; SI-NEXT: v_readlane_b32 s7, v44, 39 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 26 +; SI-NEXT: v_readlane_b32 s7, v44, 40 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 ; SI-NEXT: s_addk_i32 s23, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 ; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v44, 32 +; SI-NEXT: v_readlane_b32 s7, v44, 58 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v44, 31 +; 
SI-NEXT: v_readlane_b32 s7, v44, 57 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 29 +; SI-NEXT: v_readlane_b32 s7, v44, 43 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 30 +; SI-NEXT: v_readlane_b32 s7, v44, 56 ; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 48 +; SI-NEXT: v_readlane_b32 s7, v42, 62 ; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 47 +; SI-NEXT: v_readlane_b32 s7, v42, 61 ; SI-NEXT: s_lshl_b32 s62, s62, 16 ; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 33 +; SI-NEXT: v_readlane_b32 s7, v44, 55 ; SI-NEXT: s_or_b32 s61, s61, s62 ; SI-NEXT: s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 44 +; SI-NEXT: v_readlane_b32 s7, v44, 54 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 46 +; SI-NEXT: v_readlane_b32 s7, v42, 60 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 45 +; SI-NEXT: v_readlane_b32 s7, v42, 59 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 44 +; SI-NEXT: v_readlane_b32 s7, v42, 58 ; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 43 +; SI-NEXT: v_readlane_b32 s7, v42, 57 ; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 42 +; SI-NEXT: v_readlane_b32 s7, v42, 56 ; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 39 +; SI-NEXT: v_readlane_b32 s7, v42, 53 ; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 36 +; SI-NEXT: v_readlane_b32 s7, v42, 50 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 35 +; SI-NEXT: v_readlane_b32 s7, v42, 49 ; SI-NEXT: s_and_b32 s45, s45, 0xff ; SI-NEXT: s_add_i32 s14, 
s7, 3 ; SI-NEXT: s_or_b32 s42, s42, s45 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v43, 41 +; SI-NEXT: v_readlane_b32 s7, v42, 55 ; SI-NEXT: s_and_b32 s57, s57, 0xff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 40 +; SI-NEXT: v_readlane_b32 s7, v42, 54 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s57, s14, s15 ; SI-NEXT: s_and_b32 s14, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 38 +; SI-NEXT: v_readlane_b32 s7, v42, 52 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 37 +; SI-NEXT: v_readlane_b32 s7, v42, 51 ; SI-NEXT: s_add_i32 s40, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_and_b32 s40, s40, 0xff @@ -155711,15 +155719,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s58, s59, s58 ; SI-NEXT: s_or_b32 s59, s15, s14 ; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v43, 32 +; SI-NEXT: v_readlane_b32 s6, v42, 46 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: v_readlane_b32 s7, v42, 43 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v42, 40 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v42, 39 ; SI-NEXT: s_add_i32 s24, s7, 3 ; SI-NEXT: s_and_b32 s11, s24, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 @@ -155727,47 +155735,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s11 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v43, 33 +; SI-NEXT: v_readlane_b32 s6, v42, 47 ; SI-NEXT: 
s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v42, 41 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 17 +; SI-NEXT: v_readlane_b32 s7, v42, 31 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 19 +; SI-NEXT: v_readlane_b32 s7, v42, 33 ; SI-NEXT: s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v43, 34 +; SI-NEXT: v_readlane_b32 s7, v42, 48 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: v_readlane_b32 s7, v42, 45 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_and_b32 s11, s13, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: v_readlane_b32 s7, v42, 37 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 38 ; SI-NEXT: s_add_i32 s25, s7, 3 ; SI-NEXT: s_and_b32 s12, s25, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: v_readlane_b32 s7, v42, 44 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s9, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 28 -; SI-NEXT: v_readlane_b32 s11, v43, 21 +; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: v_readlane_b32 s11, v42, 35 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v43, 22 +; SI-NEXT: v_readlane_b32 s9, v42, 36 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s9, s9, 24 @@ -155775,15 +155783,15 @@ define inreg <64 x bfloat> 
@bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s11 ; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: v_readlane_b32 s9, v42, 34 ; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v43, 18 -; SI-NEXT: v_readlane_b32 s12, v43, 15 +; SI-NEXT: v_readlane_b32 s11, v42, 32 +; SI-NEXT: v_readlane_b32 s12, v42, 29 ; SI-NEXT: s_and_b32 s9, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v43, 16 +; SI-NEXT: v_readlane_b32 s11, v42, 30 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 @@ -155791,15 +155799,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v43, 14 +; SI-NEXT: v_readlane_b32 s11, v42, 28 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v43, 13 -; SI-NEXT: v_readlane_b32 s13, v43, 11 +; SI-NEXT: v_readlane_b32 s12, v42, 27 +; SI-NEXT: v_readlane_b32 s13, v42, 25 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v43, 12 +; SI-NEXT: v_readlane_b32 s12, v42, 26 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 @@ -155807,16 +155815,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v43, 10 +; SI-NEXT: v_readlane_b32 s12, v42, 24 ; SI-NEXT: s_add_i32 s15, s16, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v43, 9 -; SI-NEXT: v_readlane_b32 s16, v43, 7 +; 
SI-NEXT: v_readlane_b32 s13, v42, 23 +; SI-NEXT: v_readlane_b32 s16, v42, 21 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 8 +; SI-NEXT: v_readlane_b32 s13, v42, 22 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s13, s13, 24 @@ -155824,16 +155832,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s13, s16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 6 +; SI-NEXT: v_readlane_b32 s13, v42, 20 ; SI-NEXT: s_add_i32 s40, s17, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v43, 5 -; SI-NEXT: v_readlane_b32 s17, v43, 3 +; SI-NEXT: v_readlane_b32 s16, v42, 19 +; SI-NEXT: v_readlane_b32 s17, v42, 17 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v43, 4 +; SI-NEXT: v_readlane_b32 s16, v42, 18 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 @@ -155841,16 +155849,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v43, 2 +; SI-NEXT: v_readlane_b32 s16, v42, 16 ; SI-NEXT: s_add_i32 s41, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 1 -; SI-NEXT: v_readlane_b32 s18, v44, 63 +; SI-NEXT: v_readlane_b32 s17, v42, 15 +; SI-NEXT: v_readlane_b32 s18, v42, 13 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 0 +; SI-NEXT: v_readlane_b32 s17, v42, 14 ; SI-NEXT: 
s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -155859,16 +155867,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v44, 62 +; SI-NEXT: v_readlane_b32 s16, v42, 12 ; SI-NEXT: s_add_i32 s42, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v44, 61 -; SI-NEXT: v_readlane_b32 s19, v44, 59 +; SI-NEXT: v_readlane_b32 s18, v42, 11 +; SI-NEXT: v_readlane_b32 s19, v42, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v44, 60 +; SI-NEXT: v_readlane_b32 s18, v42, 10 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -155876,16 +155884,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v44, 58 +; SI-NEXT: v_readlane_b32 s18, v42, 8 ; SI-NEXT: s_add_i32 s43, s20, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v44, 57 -; SI-NEXT: v_readlane_b32 s20, v44, 55 +; SI-NEXT: v_readlane_b32 s19, v42, 7 +; SI-NEXT: v_readlane_b32 s20, v42, 5 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v44, 56 +; SI-NEXT: v_readlane_b32 s19, v42, 6 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_lshl_b32 s19, s19, 24 @@ -155893,15 +155901,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_or_b32 s18, 
s19, s18 -; SI-NEXT: v_readlane_b32 s19, v44, 54 +; SI-NEXT: v_readlane_b32 s19, v42, 4 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v44, 53 -; SI-NEXT: v_readlane_b32 s21, v44, 51 +; SI-NEXT: v_readlane_b32 s20, v42, 3 +; SI-NEXT: v_readlane_b32 s21, v42, 1 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v44, 52 +; SI-NEXT: v_readlane_b32 s20, v42, 2 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 @@ -155909,16 +155917,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v44, 50 +; SI-NEXT: v_readlane_b32 s20, v42, 0 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v44, 49 -; SI-NEXT: v_readlane_b32 s22, v44, 47 +; SI-NEXT: v_readlane_b32 s21, v44, 63 +; SI-NEXT: v_readlane_b32 s22, v44, 61 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v44, 48 +; SI-NEXT: v_readlane_b32 s21, v44, 62 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -155927,16 +155935,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 43 +; SI-NEXT: v_readlane_b32 s20, v44, 53 ; SI-NEXT: s_add_i32 s45, s23, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v44, 42 -; SI-NEXT: v_readlane_b32 s23, v44, 45 +; SI-NEXT: v_readlane_b32 s22, v44, 52 +; SI-NEXT: v_readlane_b32 s23, v44, 59 ; SI-NEXT: 
s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v44, 46 +; SI-NEXT: v_readlane_b32 s22, v44, 60 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 @@ -155945,15 +155953,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 41 +; SI-NEXT: v_readlane_b32 s20, v44, 51 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v44, 40 -; SI-NEXT: v_readlane_b32 s24, v44, 38 +; SI-NEXT: v_readlane_b32 s23, v44, 50 +; SI-NEXT: v_readlane_b32 s24, v44, 48 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v44, 39 +; SI-NEXT: v_readlane_b32 s23, v44, 49 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s23, s23, 24 @@ -155962,15 +155970,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s23, s23, s24 ; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 37 +; SI-NEXT: v_readlane_b32 s20, v44, 47 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v44, 36 -; SI-NEXT: v_readlane_b32 s25, v44, 34 +; SI-NEXT: v_readlane_b32 s24, v44, 46 +; SI-NEXT: v_readlane_b32 s25, v44, 44 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v44, 35 +; SI-NEXT: v_readlane_b32 s24, v44, 45 ; SI-NEXT: s_and_b32 s25, s25, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 @@ -155988,100 +155996,101 @@ define inreg <64 x bfloat> 
@bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s24, s25, s24 ; SI-NEXT: v_readlane_b32 s25, v44, 0 ; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: s_add_i32 s9, s9, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s17, 16 -; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_and_b32 s76, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s19, 16 +; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s13, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s12, 16 +; SI-NEXT: s_and_b32 s12, s11, 0xffff0000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_and_b32 s74, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s18, 16 -; SI-NEXT: v_writelane_b32 v43, s17, 49 -; SI-NEXT: s_and_b32 s63, s11, 0xffff0000 +; SI-NEXT: s_and_b32 s25, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s22, 16 +; SI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s18, 16 +; SI-NEXT: v_writelane_b32 v43, s12, 0 ; SI-NEXT: s_lshl_b32 s18, s11, 16 ; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 -; SI-NEXT: s_and_b32 s46, s46, 0xff -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s11, 50 -; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s11, 1 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s9, 2 ; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_and_b32 s79, s17, 
0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s17, 16 +; SI-NEXT: v_writelane_b32 v43, s9, 3 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_and_b32 s7, s10, 0xffff0000 ; SI-NEXT: s_lshl_b32 s46, s46, 16 ; SI-NEXT: s_addk_i32 s56, 0x300 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s9, 51 -; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 4 +; SI-NEXT: s_lshl_b32 s7, s10, 16 ; SI-NEXT: s_or_b32 s46, s47, s46 ; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: v_writelane_b32 v43, s7, 52 -; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 5 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 ; SI-NEXT: s_or_b32 s56, s46, s47 ; SI-NEXT: s_add_i32 s47, s58, 0x3000000 ; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: v_writelane_b32 v43, s7, 6 ; SI-NEXT: s_lshl_b32 s7, s8, 16 -; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 54 -; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 ; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 -; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s57, s57, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 55 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s7, 7 +; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s66, s23, 0xffff0000 +; SI-NEXT: 
s_and_b32 s73, s23, 0xffff0000 ; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s64, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s95, s22, 16 -; SI-NEXT: s_and_b32 s76, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s23, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s77, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s74, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s75, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s12, 16 -; SI-NEXT: s_and_b32 s13, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s10, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s60, s8, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s7, 8 ; SI-NEXT: s_lshl_b32 s99, s58, 16 -; SI-NEXT: v_writelane_b32 v43, s7, 56 +; SI-NEXT: s_and_b32 s58, s57, 0xffff0000 ; SI-NEXT: s_lshl_b32 s57, s57, 16 ; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s56, 16 ; SI-NEXT: s_and_b32 s56, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s47, 16 -; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s47, 16 +; SI-NEXT: s_and_b32 s61, s46, 0xffff0000 ; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s24, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_and_b32 s58, s44, 0xffff0000 +; SI-NEXT: s_and_b32 s62, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s45, 16 +; SI-NEXT: s_and_b32 s64, s44, 0xffff0000 ; SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s73, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s43, 16 +; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s43, 16 ; SI-NEXT: s_and_b32 s67, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 ; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s41, 16 +; SI-NEXT: s_lshl_b32 s66, s41, 16 ; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 ; SI-NEXT: s_and_b32 s94, s15, 0xffff0000 @@ -156092,99 
+156101,102 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s80, s5, 16 ; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v43, s7, 57 +; SI-NEXT: v_writelane_b32 v42, s13, 63 +; SI-NEXT: v_writelane_b32 v43, s7, 9 ; SI-NEXT: .LBB89_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s4, v43, 49 +; SI-NEXT: v_readlane_b32 s4, v42, 63 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 50 +; SI-NEXT: v_readlane_b32 s4, v43, 1 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 51 +; SI-NEXT: v_readlane_b32 s4, v43, 3 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -156192,14 +156204,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 4 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 52 +; SI-NEXT: v_readlane_b32 s4, v43, 6 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -156207,16 +156221,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a 
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v43, 54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_readlane_b32 s4, v43, 7 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 55 +; SI-NEXT: v_readlane_b32 s4, v43, 8 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -156224,15 +156237,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 56 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 57 +; SI-NEXT: v_readlane_b32 s4, v43, 9 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -156244,35 +156256,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -156286,7 +156298,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -156371,52 +156383,56 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: 
.LBB89_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: s_mov_b32 s7, s6 +; SI-NEXT: v_readlane_b32 s92, v44, 38 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: v_readlane_b32 s91, v44, 35 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_readlane_b32 s90, v44, 31 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_readlane_b32 s37, v44, 37 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s92, v44, 24 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: v_readlane_b32 s91, v44, 20 -; SI-NEXT: s_mov_b32 s90, s88 -; SI-NEXT: v_readlane_b32 s36, v44, 23 -; SI-NEXT: v_readlane_b32 s35, v44, 19 -; SI-NEXT: v_readlane_b32 s62, v44, 22 -; SI-NEXT: v_readlane_b32 s38, v44, 18 -; SI-NEXT: s_mov_b32 s34, s46 -; SI-NEXT: s_mov_b32 s93, s21 -; SI-NEXT: s_mov_b32 s37, s43 -; SI-NEXT: s_mov_b32 s39, s75 -; SI-NEXT: v_readlane_b32 s72, v44, 10 -; SI-NEXT: s_mov_b32 s50, s63 -; SI-NEXT: s_mov_b32 s51, s59 -; SI-NEXT: s_mov_b32 s48, s56 -; SI-NEXT: v_readlane_b32 s30, v44, 21 -; SI-NEXT: s_mov_b32 s49, s61 -; SI-NEXT: s_mov_b32 s52, s79 -; SI-NEXT: v_readlane_b32 s98, v44, 6 -; SI-NEXT: s_mov_b32 s55, s45 -; SI-NEXT: v_readlane_b32 s43, v44, 17 -; SI-NEXT: s_mov_b32 s60, s40 -; SI-NEXT: v_readlane_b32 s41, v44, 14 -; SI-NEXT: s_mov_b32 s53, s42 -; SI-NEXT: s_mov_b32 s54, s13 -; SI-NEXT: v_readlane_b32 s14, v44, 13 -; SI-NEXT: v_readlane_b32 s44, v44, 5 -; SI-NEXT: v_readlane_b32 s9, v44, 11 -; SI-NEXT: v_readlane_b32 s81, v44, 12 -; SI-NEXT: v_readlane_b32 s82, v44, 9 -; SI-NEXT: v_readlane_b32 s10, v44, 16 -; SI-NEXT: v_readlane_b32 s12, v44, 4 -; SI-NEXT: v_readlane_b32 s96, v44, 7 -; SI-NEXT: v_readlane_b32 s83, v44, 8 -; SI-NEXT: v_readlane_b32 
s71, v44, 15 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_readlane_b32 s36, v44, 34 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: v_readlane_b32 s30, v44, 36 +; SI-NEXT: v_readlane_b32 s53, v44, 33 +; SI-NEXT: v_readlane_b32 s34, v44, 27 +; SI-NEXT: v_readlane_b32 s93, v44, 19 +; SI-NEXT: v_readlane_b32 s38, v44, 26 +; SI-NEXT: v_readlane_b32 s75, v44, 18 +; SI-NEXT: v_readlane_b32 s72, v44, 11 +; SI-NEXT: v_readlane_b32 s63, v44, 23 +; SI-NEXT: v_readlane_b32 s59, v44, 24 +; SI-NEXT: v_readlane_b32 s48, v44, 25 +; SI-NEXT: s_mov_b32 s39, s43 +; SI-NEXT: v_readlane_b32 s52, v44, 22 +; SI-NEXT: v_readlane_b32 s49, v44, 32 +; SI-NEXT: v_readlane_b32 s98, v44, 7 +; SI-NEXT: v_readlane_b32 s54, v44, 30 +; SI-NEXT: v_readlane_b32 s55, v44, 20 +; SI-NEXT: v_readlane_b32 s51, v44, 21 +; SI-NEXT: v_readlane_b32 s41, v44, 15 +; SI-NEXT: v_readlane_b32 s50, v44, 28 +; SI-NEXT: v_readlane_b32 s13, v44, 29 +; SI-NEXT: v_readlane_b32 s14, v44, 14 +; SI-NEXT: v_readlane_b32 s44, v44, 6 +; SI-NEXT: v_readlane_b32 s9, v44, 12 +; SI-NEXT: v_readlane_b32 s81, v44, 13 +; SI-NEXT: v_readlane_b32 s82, v44, 10 +; SI-NEXT: v_readlane_b32 s10, v44, 17 +; SI-NEXT: v_readlane_b32 s7, v44, 4 +; SI-NEXT: v_readlane_b32 s12, v44, 5 +; SI-NEXT: v_readlane_b32 s96, v44, 8 +; SI-NEXT: v_readlane_b32 s83, v44, 9 +; SI-NEXT: v_readlane_b32 s71, v44, 16 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 @@ -156425,50 +156441,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; 
implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; kill: killed $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; 
implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr70 @@ -182467,12 +182481,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s10, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v61, s29, 0 -; SI-NEXT: v_writelane_b32 v61, s28, 1 -; SI-NEXT: v_writelane_b32 v61, s27, 2 -; SI-NEXT: s_mov_b32 s61, s21 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 @@ -182506,59 +182515,58 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s86, 30 ; SI-NEXT: v_writelane_b32 v63, s87, 31 ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: s_mov_b32 s10, s16 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: s_mov_b32 s67, s19 -; SI-NEXT: s_mov_b32 s54, s17 -; SI-NEXT: s_mov_b32 s35, s23 -; SI-NEXT: s_mov_b32 s39, s26 -; SI-NEXT: s_mov_b32 s62, s25 +; SI-NEXT: s_mov_b32 s50, s29 ; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_writelane_b32 v63, s99, 35 ; SI-NEXT: v_readfirstlane_b32 s99, v1 -; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: v_readfirstlane_b32 s44, v22 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s6, v23 +; SI-NEXT: v_readfirstlane_b32 s73, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v62, s74, 0 -; SI-NEXT: v_readfirstlane_b32 s12, v26 -; SI-NEXT: v_writelane_b32 v62, s6, 1 +; SI-NEXT: v_writelane_b32 v62, s44, 0 +; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: v_writelane_b32 v62, s73, 1 +; SI-NEXT: s_mov_b32 s60, s20 +; SI-NEXT: v_readfirstlane_b32 s69, v23 +; SI-NEXT: v_writelane_b32 v62, 
s74, 2 +; SI-NEXT: v_readfirstlane_b32 s11, v26 +; SI-NEXT: v_writelane_b32 v62, s69, 3 ; SI-NEXT: v_readfirstlane_b32 s14, v25 -; SI-NEXT: v_writelane_b32 v62, s12, 2 +; SI-NEXT: v_writelane_b32 v62, s11, 4 ; SI-NEXT: v_readfirstlane_b32 s46, v28 -; SI-NEXT: v_writelane_b32 v62, s14, 3 +; SI-NEXT: v_writelane_b32 v62, s14, 5 +; SI-NEXT: s_mov_b32 s68, s24 ; SI-NEXT: v_readfirstlane_b32 s56, v27 -; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: v_writelane_b32 v62, s46, 6 +; SI-NEXT: s_mov_b32 s76, s27 ; SI-NEXT: v_readfirstlane_b32 s57, v30 -; SI-NEXT: v_writelane_b32 v62, s56, 5 -; SI-NEXT: v_readfirstlane_b32 s59, v29 -; SI-NEXT: v_writelane_b32 v62, s57, 6 -; SI-NEXT: v_writelane_b32 v62, s59, 7 -; SI-NEXT: s_mov_b32 s60, s20 -; SI-NEXT: s_mov_b32 s63, s24 +; SI-NEXT: v_writelane_b32 v62, s56, 7 +; SI-NEXT: v_writelane_b32 v62, s57, 8 ; SI-NEXT: v_readfirstlane_b32 s95, v3 ; SI-NEXT: v_readfirstlane_b32 s31, v5 ; SI-NEXT: v_readfirstlane_b32 s24, v9 ; SI-NEXT: v_readfirstlane_b32 s38, v12 ; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_readfirstlane_b32 s27, v13 ; SI-NEXT: v_readfirstlane_b32 s9, v16 ; SI-NEXT: v_readfirstlane_b32 s79, v15 ; SI-NEXT: v_readfirstlane_b32 s13, v18 -; SI-NEXT: v_readfirstlane_b32 s15, v17 +; SI-NEXT: v_readfirstlane_b32 s40, v17 ; SI-NEXT: v_readfirstlane_b32 s42, v20 ; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_readfirstlane_b32 s44, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: v_writelane_b32 v61, s4, 3 -; SI-NEXT: v_readfirstlane_b32 s45, v21 +; SI-NEXT: v_readfirstlane_b32 s89, v29 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s27, v13 ; SI-NEXT: v_readfirstlane_b32 s98, v10 ; SI-NEXT: v_readfirstlane_b32 s90, v8 ; SI-NEXT: v_readfirstlane_b32 s88, v7 ; SI-NEXT: v_readfirstlane_b32 s91, v6 -; SI-NEXT: v_readfirstlane_b32 s93, v4 +; SI-NEXT: 
v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 +; SI-NEXT: v_writelane_b32 v61, s4, 0 ; SI-NEXT: v_readfirstlane_b32 s55, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill @@ -182577,389 +182585,374 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 -; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s4, 1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 -; SI-NEXT: v_writelane_b32 v61, s4, 5 +; SI-NEXT: v_writelane_b32 v61, s4, 2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s4, 3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; SI-NEXT: v_writelane_b32 v61, s4, 7 +; SI-NEXT: v_writelane_b32 v61, s4, 4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 -; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 -; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: v_writelane_b32 v61, s4, 6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: 
v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 -; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: v_writelane_b32 v61, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 -; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: v_writelane_b32 v61, s4, 13 +; SI-NEXT: v_writelane_b32 v61, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 -; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 -; SI-NEXT: v_writelane_b32 v61, s4, 15 +; SI-NEXT: v_writelane_b32 v61, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s67, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: v_writelane_b32 v61, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s54, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v61, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s65, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 -; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s70, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 -; SI-NEXT: v_writelane_b32 v61, s4, 19 ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s71, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 -; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s49, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v61, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s83, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s80, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: v_writelane_b32 v61, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s82, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s84, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: v_writelane_b32 v61, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s87, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s86, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: v_writelane_b32 v61, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s51, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 
s96, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: v_writelane_b32 v61, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_writelane_b32 v61, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s94, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: v_writelane_b32 v61, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s16, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v61, s4, 33 +; SI-NEXT: v_writelane_b32 v61, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: v_readfirstlane_b32 s15, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 -; SI-NEXT: v_writelane_b32 v61, s4, 34 +; SI-NEXT: v_writelane_b32 v61, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 -; SI-NEXT: v_writelane_b32 v61, s4, 35 +; SI-NEXT: v_writelane_b32 v61, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword 
v31, off, s[0:3], s32 offset:184 +; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s40, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 +; SI-NEXT: v_writelane_b32 v61, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: v_readfirstlane_b32 s85, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: v_readfirstlane_b32 s81, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: v_readfirstlane_b32 s97, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 +; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: v_writelane_b32 v61, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s47, v31 +; SI-NEXT: v_readfirstlane_b32 s12, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s58, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: v_readfirstlane_b32 s47, v31 ; 
SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: v_writelane_b32 v61, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: v_writelane_b32 v61, s4, 36 -; SI-NEXT: v_writelane_b32 v61, s54, 37 -; SI-NEXT: v_writelane_b32 v61, s10, 38 -; SI-NEXT: v_writelane_b32 v61, s67, 39 -; SI-NEXT: v_writelane_b32 v61, s18, 40 -; SI-NEXT: v_writelane_b32 v61, s61, 41 -; SI-NEXT: v_writelane_b32 v61, s60, 42 -; SI-NEXT: v_writelane_b32 v61, s35, 43 -; SI-NEXT: v_writelane_b32 v61, s22, 44 -; SI-NEXT: v_writelane_b32 v61, s62, 45 -; SI-NEXT: v_writelane_b32 v61, s63, 46 -; SI-NEXT: v_writelane_b32 v61, s39, 47 -; SI-NEXT: v_writelane_b32 v61, s99, 48 -; SI-NEXT: v_writelane_b32 v61, s95, 49 -; SI-NEXT: v_writelane_b32 v61, s31, 50 -; SI-NEXT: v_writelane_b32 v61, s24, 51 -; SI-NEXT: v_writelane_b32 v61, s38, 52 -; SI-NEXT: v_writelane_b32 v61, s36, 53 -; SI-NEXT: v_writelane_b32 v61, s8, 54 -; SI-NEXT: v_writelane_b32 v61, s27, 55 -; SI-NEXT: v_writelane_b32 v61, s9, 56 -; SI-NEXT: v_writelane_b32 v61, s79, 57 -; SI-NEXT: v_writelane_b32 v61, s13, 58 -; SI-NEXT: v_writelane_b32 v61, s15, 59 -; SI-NEXT: v_writelane_b32 v61, s42, 60 -; SI-NEXT: v_writelane_b32 v61, s43, 61 -; SI-NEXT: v_writelane_b32 v61, s44, 62 -; SI-NEXT: v_writelane_b32 v61, s45, 63 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s37, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s50, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:124 +; SI-NEXT: v_writelane_b32 v61, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s48, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s19, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: v_writelane_b32 v61, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s64, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s17, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; SI-NEXT: v_writelane_b32 v61, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s65, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; SI-NEXT: v_writelane_b32 v61, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s70, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s83, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: v_writelane_b32 v61, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s49, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s80, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; SI-NEXT: v_writelane_b32 v61, s4, 35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: v_writelane_b32 v61, s4, 36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: v_writelane_b32 v61, s4, 37 +; SI-NEXT: v_writelane_b32 v61, s17, 38 +; SI-NEXT: v_writelane_b32 v61, s10, 39 +; SI-NEXT: v_writelane_b32 v61, s19, 40 +; SI-NEXT: v_writelane_b32 v61, s18, 41 +; SI-NEXT: v_writelane_b32 v61, s21, 42 +; SI-NEXT: v_writelane_b32 v61, s60, 43 +; SI-NEXT: v_writelane_b32 v61, s23, 44 +; SI-NEXT: v_writelane_b32 v61, s22, 45 +; SI-NEXT: v_writelane_b32 v61, s25, 46 +; SI-NEXT: v_writelane_b32 v61, s68, 47 +; SI-NEXT: v_writelane_b32 v61, s76, 48 +; SI-NEXT: v_writelane_b32 v61, s26, 49 +; SI-NEXT: v_writelane_b32 v61, s50, 50 +; SI-NEXT: v_writelane_b32 v61, s99, 51 +; SI-NEXT: v_writelane_b32 v61, s28, 52 +; SI-NEXT: v_writelane_b32 v61, s95, 53 +; SI-NEXT: v_writelane_b32 v61, s31, 54 +; SI-NEXT: v_writelane_b32 v61, s24, 55 +; SI-NEXT: v_writelane_b32 v61, s38, 56 +; SI-NEXT: v_writelane_b32 v61, s36, 57 +; SI-NEXT: v_writelane_b32 v61, s9, 58 +; SI-NEXT: v_writelane_b32 v61, s79, 59 +; SI-NEXT: v_writelane_b32 v61, s13, 60 +; SI-NEXT: v_writelane_b32 v61, s40, 61 +; SI-NEXT: v_writelane_b32 v61, s42, 62 +; SI-NEXT: v_writelane_b32 v61, s43, 63 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v31 +; SI-NEXT: v_readfirstlane_b32 s78, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: v_readfirstlane_b32 
s29, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s75, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: v_readfirstlane_b32 s77, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: v_readfirstlane_b32 s30, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: v_readfirstlane_b32 s92, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s34, v31 +; SI-NEXT: v_readfirstlane_b32 s35, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_readfirstlane_b32 s39, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: v_readfirstlane_b32 s64, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s78, v31 +; SI-NEXT: v_readfirstlane_b32 s48, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: v_readfirstlane_b32 s52, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: v_readfirstlane_b32 s37, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: v_readfirstlane_b32 s63, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s52, v31 +; SI-NEXT: v_readfirstlane_b32 s34, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_readfirstlane_b32 s62, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: v_readfirstlane_b32 s7, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s72, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: v_readfirstlane_b32 s66, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: v_readfirstlane_b32 s93, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v62, s25, 8 -; SI-NEXT: v_writelane_b32 v62, s28, 9 +; SI-NEXT: v_writelane_b32 v62, s93, 9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s92, v31 -; SI-NEXT: v_writelane_b32 v62, s92, 10 -; SI-NEXT: v_writelane_b32 v62, s75, 11 -; SI-NEXT: v_writelane_b32 v62, s26, 12 -; SI-NEXT: v_writelane_b32 v62, s30, 13 -; SI-NEXT: v_writelane_b32 v62, s23, 14 -; SI-NEXT: v_writelane_b32 v62, s52, 15 -; SI-NEXT: v_writelane_b32 v62, s64, 16 -; SI-NEXT: v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s65, 18 -; SI-NEXT: v_writelane_b32 v62, s70, 19 -; SI-NEXT: v_writelane_b32 v62, s71, 20 -; SI-NEXT: v_writelane_b32 v62, s49, 21 -; SI-NEXT: v_writelane_b32 v62, s83, 22 -; SI-NEXT: v_writelane_b32 v62, s80, 23 -; SI-NEXT: v_writelane_b32 v62, s82, 24 -; SI-NEXT: v_writelane_b32 v62, s84, 25 -; SI-NEXT: v_writelane_b32 v62, s87, 26 -; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s51, 28 -; SI-NEXT: 
v_writelane_b32 v62, s96, 29 -; SI-NEXT: v_writelane_b32 v62, s34, 30 -; SI-NEXT: v_writelane_b32 v62, s94, 31 -; SI-NEXT: v_writelane_b32 v62, s53, 32 -; SI-NEXT: v_writelane_b32 v62, s66, 33 -; SI-NEXT: v_writelane_b32 v62, s68, 34 -; SI-NEXT: v_writelane_b32 v62, s69, 35 -; SI-NEXT: v_writelane_b32 v62, s77, 36 -; SI-NEXT: v_writelane_b32 v62, s78, 37 -; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: v_writelane_b32 v62, s53, 10 +; SI-NEXT: v_writelane_b32 v62, s66, 11 +; SI-NEXT: v_writelane_b32 v62, s7, 12 +; SI-NEXT: v_writelane_b32 v62, s78, 13 +; SI-NEXT: v_writelane_b32 v62, s77, 14 +; SI-NEXT: v_writelane_b32 v62, s92, 15 +; SI-NEXT: v_writelane_b32 v62, s75, 16 +; SI-NEXT: v_writelane_b32 v62, s37, 17 +; SI-NEXT: v_writelane_b32 v62, s39, 18 +; SI-NEXT: v_writelane_b32 v62, s30, 19 +; SI-NEXT: v_writelane_b32 v62, s48, 20 +; SI-NEXT: v_writelane_b32 v62, s35, 21 +; SI-NEXT: v_writelane_b32 v62, s52, 22 +; SI-NEXT: v_writelane_b32 v62, s64, 23 +; SI-NEXT: s_cbranch_scc0 .LBB93_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_lshl_b32 s5, s19, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 
s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s5, v61, 2 -; SI-NEXT: s_and_b32 s4, s39, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 1 -; SI-NEXT: v_readlane_b32 s5, v61, 0 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_and_b32 s4, s99, 0xff ; SI-NEXT: s_lshl_b32 s5, s55, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s31, 0xff @@ -182986,7 +182979,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s9, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_and_b32 s4, s40, 0xff ; SI-NEXT: s_lshl_b32 s5, s13, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 @@ -182994,231 +182987,230 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_and_b32 s4, s73, 0xff ; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: 
v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_and_b32 s4, s69, 0xff ; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_and_b32 s4, s56, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_and_b32 s4, s89, 0xff ; SI-NEXT: s_lshl_b32 s5, s57, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s63, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s52, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s78, 0xff -; SI-NEXT: s_lshl_b32 s5, s66, 8 +; SI-NEXT: s_and_b32 s4, s48, 0xff +; SI-NEXT: s_lshl_b32 s5, s64, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; 
SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s5, s35, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s79, v61, 37 +; SI-NEXT: v_readlane_b32 s57, v61, 36 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s46, v61, 35 +; SI-NEXT: v_readlane_b32 s56, v61, 34 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_and_b32 s4, s80, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s43, v61, 33 +; SI-NEXT: v_readlane_b32 s44, v61, 32 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s40, v61, 31 +; SI-NEXT: v_readlane_b32 s42, v61, 30 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s22, v61, 29 +; SI-NEXT: v_readlane_b32 s26, v61, 28 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; 
SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s26, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s28, v61, 27 +; SI-NEXT: v_readlane_b32 s18, v61, 26 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s18, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s14, v61, 25 +; SI-NEXT: v_readlane_b32 s13, v61, 24 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s8, v61, 36 +; SI-NEXT: v_readlane_b32 s11, v61, 23 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s76, 0xff +; SI-NEXT: s_and_b32 s4, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_and_b32 s4, s12, 0xff ; SI-NEXT: s_lshl_b32 s5, s41, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s25, v61, 22 +; SI-NEXT: v_readlane_b32 s10, v61, 21 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s21, v61, 20 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s81, 8 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s97, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_and_b32 s4, s85, 0xff -; SI-NEXT: 
s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s81, 0xff +; SI-NEXT: s_lshl_b32 s5, s85, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s9, v61, 19 +; SI-NEXT: v_readlane_b32 s7, v61, 18 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s69, v61, 35 +; SI-NEXT: v_readlane_b32 s68, v61, 17 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s68, v61, 34 +; SI-NEXT: v_readlane_b32 s66, v61, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_readlane_b32 s53, v61, 15 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_and_b32 s4, s53, 0xff ; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_mov_b32 s61, s34 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: v_readlane_b32 s34, v61, 14 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_and_b32 s4, s34, 0xff ; SI-NEXT: s_lshl_b32 s5, s94, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_readlane_b32 s93, v61, 13 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_and_b32 s4, s93, 0xff ; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: v_readlane_b32 s86, v61, 27 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 ; SI-NEXT: s_and_b32 s4, s51, 
0xff ; SI-NEXT: s_lshl_b32 s5, s86, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s84, v61, 25 ; SI-NEXT: v_cvt_f32_f16_e32 v45, s4 ; SI-NEXT: s_and_b32 s4, s87, 0xff ; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s80, v61, 23 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 ; SI-NEXT: s_and_b32 s4, s82, 0xff ; SI-NEXT: s_lshl_b32 s5, s80, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s49, v61, 21 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 ; SI-NEXT: s_and_b32 s4, s83, 0xff ; SI-NEXT: s_lshl_b32 s5, s49, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s70, v61, 19 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 ; SI-NEXT: s_and_b32 s4, s71, 0xff ; SI-NEXT: s_lshl_b32 s5, s70, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s54, v61, 17 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 ; SI-NEXT: s_and_b32 s4, s65, 0xff ; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: s_mov_b32 s19, s50 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_readlane_b32 s50, v61, 12 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: s_and_b32 s4, s67, 0xff ; SI-NEXT: s_lshl_b32 s5, s50, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_readlane_b32 s64, v61, 11 +; SI-NEXT: v_readlane_b32 s52, v61, 10 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 ; SI-NEXT: s_and_b32 s4, s64, 0xff ; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: s_mov_b32 s23, s48 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_readlane_b32 s35, v61, 9 +; SI-NEXT: v_readlane_b32 s48, v61, 8 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 ; SI-NEXT: 
s_and_b32 s4, s35, 0xff ; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: v_readlane_b32 s30, v61, 7 +; SI-NEXT: v_readlane_b32 s39, v61, 6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_and_b32 s4, s30, 0xff ; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_mov_b32 s26, s37 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_readlane_b32 s37, v61, 5 +; SI-NEXT: v_readlane_b32 s75, v61, 4 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 ; SI-NEXT: s_and_b32 s4, s37, 0xff ; SI-NEXT: s_lshl_b32 s5, s75, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_readlane_b32 s92, v61, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s5, s77, 8 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_readlane_b32 s78, v61, 1 +; SI-NEXT: v_readlane_b32 s76, v61, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s78, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 @@ -183226,24 +183218,151 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b32 s99, s55 ; SI-NEXT: s_mov_b32 s20, s88 ; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s59, s58 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 
; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_mov_b32 s55, s93 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s55, s6 ; SI-NEXT: s_mov_b32 s95, s91 ; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: s_cbranch_execnz .LBB93_3 -; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_mov_b32 s36, s8 +; SI-NEXT: s_mov_b32 s38, s27 +; SI-NEXT: s_mov_b32 s6, s7 +; SI-NEXT: s_mov_b32 s8, s9 +; SI-NEXT: s_mov_b32 s7, s10 +; SI-NEXT: s_mov_b32 s98, s89 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s17, s72 +; SI-NEXT: s_mov_b32 s19, s62 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_mov_b32 s23, s63 +; SI-NEXT: s_mov_b32 s13, s18 +; SI-NEXT: s_mov_b32 s27, s22 +; SI-NEXT: s_branch .LBB93_3 +; SI-NEXT: .LBB93_2: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s61, s34 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s24, s98 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_mov_b32 s20, s88 +; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_readlane_b32 s75, v61, 4 +; SI-NEXT: v_readlane_b32 s76, v61, 0 +; SI-NEXT: v_readlane_b32 s77, v61, 2 +; SI-NEXT: v_readlane_b32 s78, v61, 1 +; SI-NEXT: v_readlane_b32 s92, v61, 3 +; SI-NEXT: v_readlane_b32 s39, v61, 6 +; SI-NEXT: v_readlane_b32 s37, v61, 5 +; SI-NEXT: v_readlane_b32 s30, v61, 7 +; SI-NEXT: v_readlane_b32 s48, v61, 8 +; SI-NEXT: v_readlane_b32 s52, v61, 10 +; SI-NEXT: v_readlane_b32 s35, v61, 9 +; SI-NEXT: v_readlane_b32 s50, v61, 12 +; SI-NEXT: v_readlane_b32 s64, v61, 11 +; SI-NEXT: s_mov_b32 s55, s6 +; SI-NEXT: s_mov_b32 s95, s91 +; SI-NEXT: s_mov_b32 s31, s90 +; SI-NEXT: s_mov_b32 s36, s8 +; SI-NEXT: s_mov_b32 s38, s27 +; SI-NEXT: v_readlane_b32 s6, v61, 18 +; SI-NEXT: v_readlane_b32 s93, v61, 13 +; SI-NEXT: v_readlane_b32 s34, v61, 14 +; SI-NEXT: v_readlane_b32 s53, v61, 15 +; SI-NEXT: v_readlane_b32 s66, v61, 16 +; SI-NEXT: v_readlane_b32 
s68, v61, 17 +; SI-NEXT: v_readlane_b32 s8, v61, 19 +; SI-NEXT: v_readlane_b32 s21, v61, 20 +; SI-NEXT: v_readlane_b32 s25, v61, 22 +; SI-NEXT: v_readlane_b32 s7, v61, 21 +; SI-NEXT: s_mov_b32 s98, s89 +; SI-NEXT: v_readlane_b32 s9, v61, 23 +; SI-NEXT: s_mov_b32 s17, s72 +; SI-NEXT: s_mov_b32 s19, s62 +; SI-NEXT: v_readlane_b32 s11, v61, 24 +; SI-NEXT: s_mov_b32 s23, s63 +; SI-NEXT: v_readlane_b32 s13, v61, 26 +; SI-NEXT: v_readlane_b32 s14, v61, 25 +; SI-NEXT: v_readlane_b32 s26, v61, 28 +; SI-NEXT: v_readlane_b32 s27, v61, 29 +; SI-NEXT: v_readlane_b32 s28, v61, 27 +; SI-NEXT: v_readlane_b32 s40, v61, 31 +; SI-NEXT: v_readlane_b32 s42, v61, 30 +; SI-NEXT: v_readlane_b32 s43, v61, 33 +; SI-NEXT: v_readlane_b32 s44, v61, 32 +; SI-NEXT: v_readlane_b32 s46, v61, 35 +; SI-NEXT: v_readlane_b32 s56, v61, 34 +; SI-NEXT: v_readlane_b32 s57, v61, 36 +; SI-NEXT: v_readlane_b32 s79, v61, 37 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: 
$vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: .LBB93_3: ; %Flow +; SI-NEXT: s_mov_b32 s88, s29 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB93_5 +; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s78, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 @@ -183261,6 +183380,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s60, s39, 8 ; SI-NEXT: s_or_b32 s60, s60, vcc_hi ; SI-NEXT: s_add_i32 vcc_hi, s35, 3 +; SI-NEXT: s_mov_b32 s90, s20 +; SI-NEXT: s_mov_b32 s20, s61 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s61, s48, 8 ; SI-NEXT: s_or_b32 s61, s61, vcc_hi @@ -183293,228 +183414,209 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s76, s84, 8 ; SI-NEXT: s_or_b32 s76, s76, vcc_hi ; SI-NEXT: s_add_i32 vcc_hi, s51, 3 -; SI-NEXT: s_add_i32 s93, s53, 3 +; 
SI-NEXT: s_add_i32 s89, s93, 3 +; SI-NEXT: s_add_i32 s93, s34, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s77, s86, 8 -; SI-NEXT: s_add_i32 s89, s34, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff ; SI-NEXT: s_lshl_b32 s78, s94, 8 -; SI-NEXT: s_add_i32 s34, s66, 3 +; SI-NEXT: s_add_i32 s34, s53, 3 ; SI-NEXT: s_or_b32 s77, s77, vcc_hi ; SI-NEXT: s_and_b32 s89, s89, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8 ; SI-NEXT: s_or_b32 s22, s78, s93 ; SI-NEXT: s_and_b32 s93, s34, 0xff ; SI-NEXT: s_lshl_b32 s92, s16, 8 -; SI-NEXT: s_add_i32 s53, s68, 3 +; SI-NEXT: s_add_i32 s53, s66, 3 ; SI-NEXT: s_or_b32 s89, vcc_hi, s89 ; SI-NEXT: s_or_b32 s92, s92, s93 ; SI-NEXT: s_and_b32 s93, s53, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s15, 8 -; SI-NEXT: s_add_i32 s66, s69, 3 +; SI-NEXT: s_add_i32 s66, s68, 3 ; SI-NEXT: s_or_b32 s93, vcc_hi, s93 ; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff ; SI-NEXT: s_lshl_b32 s34, s45, 8 -; SI-NEXT: s_add_i32 s68, s6, 3 +; SI-NEXT: s_add_i32 s68, s8, 3 ; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi ; SI-NEXT: s_and_b32 s34, s68, 0xff -; SI-NEXT: s_lshl_b32 s39, s40, 8 +; SI-NEXT: s_lshl_b32 s39, s6, 8 ; SI-NEXT: s_add_i32 s69, s81, 3 ; SI-NEXT: s_or_b32 s34, s39, s34 ; SI-NEXT: s_and_b32 s39, s69, 0xff -; SI-NEXT: s_lshl_b32 s52, s21, 8 -; SI-NEXT: s_add_i32 s81, s7, 3 +; SI-NEXT: s_lshl_b32 s52, s85, 8 +; SI-NEXT: s_add_i32 s81, s21, 3 ; SI-NEXT: s_or_b32 s39, s52, s39 ; SI-NEXT: s_and_b32 s52, s81, 0xff ; SI-NEXT: s_lshl_b32 s53, s97, 8 -; SI-NEXT: s_add_i32 s85, s12, 3 +; SI-NEXT: s_add_i32 s85, s25, 3 ; SI-NEXT: s_or_b32 s52, s53, s52 ; SI-NEXT: s_and_b32 s53, s85, 0xff -; SI-NEXT: s_lshl_b32 s64, s11, 8 -; SI-NEXT: s_add_i32 s97, s56, 3 +; SI-NEXT: s_lshl_b32 s64, s7, 8 +; SI-NEXT: s_add_i32 s97, s12, 3 ; SI-NEXT: s_or_b32 s53, s64, s53 ; SI-NEXT: s_and_b32 s64, s97, 0xff -; SI-NEXT: s_lshl_b32 s66, s46, 8 -; SI-NEXT: s_add_i32 s21, s29, 3 +; SI-NEXT: s_lshl_b32 s66, s41, 8 +; SI-NEXT: s_add_i32 s21, s47, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 12 +; 
SI-NEXT: s_mov_b32 s91, s24 ; SI-NEXT: s_or_b32 s64, s66, s64 ; SI-NEXT: s_and_b32 s21, s21, 0xff -; SI-NEXT: s_lshl_b32 s66, s59, 8 -; SI-NEXT: s_add_i32 s25, s8, 3 +; SI-NEXT: s_lshl_b32 s66, s58, 8 +; SI-NEXT: s_add_i32 s25, s59, 3 +; SI-NEXT: s_add_i32 s24, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 11 ; SI-NEXT: s_or_b32 s66, s66, s21 ; SI-NEXT: s_and_b32 s21, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s28, 8 -; SI-NEXT: s_add_i32 s29, s19, 3 +; SI-NEXT: s_lshl_b32 s6, s9, 8 +; SI-NEXT: s_add_i32 s29, s14, 3 +; SI-NEXT: s_add_i32 s7, s27, 3 +; SI-NEXT: s_add_i32 s27, s20, 3 +; SI-NEXT: s_add_i32 s20, s16, 3 ; SI-NEXT: s_or_b32 s67, s6, s21 ; SI-NEXT: s_and_b32 s6, s29, 0xff -; SI-NEXT: s_lshl_b32 s18, s26, 8 -; SI-NEXT: s_add_i32 s28, s17, 3 -; SI-NEXT: s_or_b32 s68, s18, s6 -; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: s_lshl_b32 s18, s23, 8 -; SI-NEXT: s_or_b32 s69, s18, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 -; SI-NEXT: s_add_i32 s7, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 15 -; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 16 -; SI-NEXT: s_add_i32 s27, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 13 -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_lshl_b32 s23, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 14 -; SI-NEXT: s_mov_b32 s91, s24 -; SI-NEXT: s_or_b32 s70, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 20 -; SI-NEXT: s_add_i32 s24, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 11 -; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 18 -; SI-NEXT: s_lshl_b32 s19, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 12 -; SI-NEXT: s_mov_b32 s90, s20 -; SI-NEXT: s_and_b32 s6, s11, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s20, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 9 -; SI-NEXT: s_or_b32 s71, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 22 +; SI-NEXT: s_lshl_b32 s18, s11, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s17, s16, 8 +; SI-NEXT: s_lshl_b32 s17, s17, 
8 ; SI-NEXT: v_readlane_b32 s16, v62, 10 -; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: s_or_b32 s68, s18, s6 +; SI-NEXT: s_and_b32 s6, s28, 0xff +; SI-NEXT: s_lshl_b32 s18, s13, 8 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_or_b32 s17, s17, s20 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s20, v62, 8 -; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: v_readlane_b32 s20, v62, 9 +; SI-NEXT: s_or_b32 s69, s18, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s26, 8 +; SI-NEXT: s_add_i32 s11, s40, 3 +; SI-NEXT: s_or_b32 s19, s19, s24 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 -; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 23 -; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: v_readlane_b32 s24, v62, 8 +; SI-NEXT: s_or_b32 s70, s7, s6 +; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_lshl_b32 s7, s42, 8 +; SI-NEXT: s_add_i32 s12, s43, 3 ; SI-NEXT: s_or_b32 s16, s20, s16 -; SI-NEXT: v_readlane_b32 s20, v62, 7 -; SI-NEXT: s_add_i32 s14, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 -; SI-NEXT: s_or_b32 s19, s19, s24 -; SI-NEXT: s_add_i32 s98, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v62, 6 -; SI-NEXT: s_and_b32 s6, s14, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s20, s98, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 26 +; SI-NEXT: s_or_b32 s71, s7, s6 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_add_i32 s14, s46, 3 ; SI-NEXT: s_and_b32 s27, s27, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v62, 5 -; SI-NEXT: s_add_i32 s41, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 +; SI-NEXT: v_readlane_b32 s24, v62, 7 +; SI-NEXT: s_or_b32 s81, s7, s6 +; SI-NEXT: s_and_b32 s6, s14, 0xff +; 
SI-NEXT: s_lshl_b32 s7, s56, 8 +; SI-NEXT: s_add_i32 s41, s79, 3 ; SI-NEXT: s_or_b32 s23, s23, s27 ; SI-NEXT: s_add_i32 s86, s24, 3 -; SI-NEXT: v_readlane_b32 s27, v62, 4 +; SI-NEXT: v_readlane_b32 s27, v62, 6 +; SI-NEXT: s_or_b32 s83, s7, s6 ; SI-NEXT: s_and_b32 s6, s41, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_lshl_b32 s7, s57, 8 ; SI-NEXT: s_and_b32 s24, s86, 0xff ; SI-NEXT: s_lshl_b32 s27, s27, 8 ; SI-NEXT: s_or_b32 s85, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 28 +; SI-NEXT: s_add_i32 s46, s88, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 13 ; SI-NEXT: s_or_b32 s24, s27, s24 -; SI-NEXT: v_readlane_b32 s27, v62, 3 -; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 -; SI-NEXT: s_add_i32 s12, s73, 0x300 -; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s73, v62, 2 +; SI-NEXT: v_readlane_b32 s27, v62, 5 ; SI-NEXT: s_and_b32 s6, s46, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s12, s73, 0x300 +; SI-NEXT: s_add_i32 s82, s27, 3 +; SI-NEXT: v_readlane_b32 s73, v62, 4 +; SI-NEXT: s_or_b32 s96, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 14 ; SI-NEXT: s_and_b32 s27, s82, 0xff ; SI-NEXT: s_lshl_b32 s73, s73, 8 -; SI-NEXT: s_or_b32 s96, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 31 -; SI-NEXT: s_or_b32 s27, s73, s27 -; SI-NEXT: v_readlane_b32 s73, v62, 1 ; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 27 -; SI-NEXT: s_add_i32 s13, s74, 0x300 -; SI-NEXT: s_add_i32 s65, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v62, 0 +; SI-NEXT: v_readlane_b32 s7, v62, 16 +; SI-NEXT: s_or_b32 s27, s73, s27 +; SI-NEXT: v_readlane_b32 s73, v62, 3 ; SI-NEXT: s_and_b32 s6, s47, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s13, s74, 0x300 +; SI-NEXT: s_add_i32 s65, s73, 3 +; SI-NEXT: v_readlane_b32 s74, v62, 2 +; SI-NEXT: s_or_b32 s97, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 15 ; SI-NEXT: s_and_b32 s73, s65, 0xff ; SI-NEXT: s_lshl_b32 s74, s74, 8 -; SI-NEXT: s_or_b32 s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, 
v62, 34 -; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 63 ; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 29 -; SI-NEXT: s_add_i32 s14, s75, 0x300 -; SI-NEXT: s_add_i32 s54, s74, 3 -; SI-NEXT: v_readlane_b32 s75, v61, 62 +; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: s_or_b32 s73, s74, s73 +; SI-NEXT: v_readlane_b32 s74, v62, 1 ; SI-NEXT: s_and_b32 s6, s56, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s14, s75, 0x300 +; SI-NEXT: s_add_i32 s54, s74, 3 +; SI-NEXT: v_readlane_b32 s75, v62, 0 +; SI-NEXT: s_or_b32 s63, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 18 ; SI-NEXT: s_and_b32 s74, s54, 0xff ; SI-NEXT: s_lshl_b32 s75, s75, 8 -; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 36 -; SI-NEXT: s_or_b32 s74, s75, s74 -; SI-NEXT: v_readlane_b32 s75, v61, 61 ; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 30 -; SI-NEXT: s_add_i32 s15, s76, 0x300 -; SI-NEXT: s_add_i32 s50, s75, 3 -; SI-NEXT: v_readlane_b32 s76, v61, 60 +; SI-NEXT: v_readlane_b32 s7, v62, 21 +; SI-NEXT: s_or_b32 s74, s75, s74 +; SI-NEXT: v_readlane_b32 s75, v61, 63 ; SI-NEXT: s_and_b32 s6, s58, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s15, s76, 0x300 +; SI-NEXT: s_add_i32 s50, s75, 3 +; SI-NEXT: v_readlane_b32 s76, v61, 62 +; SI-NEXT: s_or_b32 s79, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 20 ; SI-NEXT: s_and_b32 s75, s50, 0xff ; SI-NEXT: s_lshl_b32 s76, s76, 8 -; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 37 -; SI-NEXT: s_or_b32 s75, s76, s75 -; SI-NEXT: v_readlane_b32 s76, v61, 59 ; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 33 -; SI-NEXT: s_add_i32 s18, s77, 0x300 -; SI-NEXT: s_add_i32 s48, s76, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 58 +; SI-NEXT: v_readlane_b32 s7, v62, 23 +; SI-NEXT: s_or_b32 s75, s76, s75 +; SI-NEXT: v_readlane_b32 s76, v61, 61 ; SI-NEXT: s_and_b32 s6, s59, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 
s18, s77, 0x300 +; SI-NEXT: s_add_i32 s48, s76, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 60 +; SI-NEXT: s_or_b32 s78, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 17 ; SI-NEXT: s_and_b32 s76, s48, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 8 -; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 35 -; SI-NEXT: s_or_b32 s76, s77, s76 -; SI-NEXT: v_readlane_b32 s77, v61, 57 ; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 32 +; SI-NEXT: v_readlane_b32 s7, v62, 22 +; SI-NEXT: s_or_b32 s76, s77, s76 +; SI-NEXT: v_readlane_b32 s77, v61, 59 +; SI-NEXT: s_and_b32 s6, s57, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s72, 0x300 ; SI-NEXT: s_add_i32 s72, s79, 0x300 ; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 56 -; SI-NEXT: s_and_b32 s6, s57, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: v_readlane_b32 s79, v61, 58 +; SI-NEXT: s_or_b32 s88, s7, s6 ; SI-NEXT: s_and_b32 s77, s37, 0xff ; SI-NEXT: s_lshl_b32 s79, s79, 8 -; SI-NEXT: s_or_b32 s88, s7, s6 -; SI-NEXT: s_or_b32 s77, s79, s77 -; SI-NEXT: v_readlane_b32 s79, v61, 55 +; SI-NEXT: s_add_i32 s35, s38, 3 ; SI-NEXT: s_add_i32 s21, s89, 0x300 ; SI-NEXT: s_add_i32 s89, s88, 0x300 -; SI-NEXT: s_add_i32 s35, s79, 3 -; SI-NEXT: v_readlane_b32 s88, v61, 54 +; SI-NEXT: s_or_b32 s77, s79, s77 ; SI-NEXT: s_and_b32 s79, s35, 0xff -; SI-NEXT: s_lshl_b32 s88, s88, 8 +; SI-NEXT: s_lshl_b32 s88, s36, 8 ; SI-NEXT: s_or_b32 s79, s88, s79 -; SI-NEXT: v_readlane_b32 s88, v61, 53 +; SI-NEXT: v_readlane_b32 s88, v61, 57 ; SI-NEXT: s_add_i32 s25, s92, 0x300 ; SI-NEXT: s_add_i32 s30, s88, 3 -; SI-NEXT: v_readlane_b32 s92, v61, 52 +; SI-NEXT: v_readlane_b32 s92, v61, 56 ; SI-NEXT: s_and_b32 s88, s30, 0xff ; SI-NEXT: s_lshl_b32 s92, s92, 8 ; SI-NEXT: s_or_b32 s88, s92, s88 -; SI-NEXT: v_readlane_b32 s92, v61, 51 +; SI-NEXT: v_readlane_b32 s92, v61, 55 ; SI-NEXT: s_add_i32 s94, s92, 3 ; SI-NEXT: s_and_b32 s92, s94, 0xff ; SI-NEXT: s_lshl_b32 s91, s91, 8 @@ -183523,52 +183625,52 
@@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_and_b32 s90, s90, 0xff ; SI-NEXT: s_lshl_b32 s92, s31, 8 ; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 50 +; SI-NEXT: v_readlane_b32 s92, v61, 54 ; SI-NEXT: s_add_i32 s92, s92, 3 ; SI-NEXT: s_add_i32 s26, s93, 0x300 ; SI-NEXT: s_and_b32 s92, s92, 0xff ; SI-NEXT: s_lshl_b32 s93, s95, 8 ; SI-NEXT: s_or_b32 s92, s93, s92 -; SI-NEXT: v_readlane_b32 s93, v61, 49 +; SI-NEXT: v_readlane_b32 s93, v61, 53 ; SI-NEXT: s_add_i32 s93, s93, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff ; SI-NEXT: s_lshl_b32 s94, s55, 8 ; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: v_readlane_b32 s94, v61, 48 +; SI-NEXT: v_readlane_b32 s94, v61, 51 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_and_b32 s94, s94, 0xff ; SI-NEXT: s_lshl_b32 s95, s99, 8 ; SI-NEXT: s_or_b32 s94, s95, s94 -; SI-NEXT: v_readlane_b32 s95, v61, 1 +; SI-NEXT: v_readlane_b32 s95, v61, 52 ; SI-NEXT: s_add_i32 s95, s95, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 0 +; SI-NEXT: v_readlane_b32 s30, v61, 50 ; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 ; SI-NEXT: s_and_b32 s95, s95, 0xff ; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 47 +; SI-NEXT: v_readlane_b32 s30, v61, 49 ; SI-NEXT: s_or_b32 s95, vcc_lo, s95 ; SI-NEXT: s_add_i32 vcc_lo, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 2 +; SI-NEXT: v_readlane_b32 s30, v61, 48 ; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 46 +; SI-NEXT: v_readlane_b32 s30, v61, 47 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo ; SI-NEXT: s_add_i32 vcc_hi, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 45 +; SI-NEXT: v_readlane_b32 s30, v61, 46 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s30, s30, 8 ; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi -; SI-NEXT: v_readlane_b32 s30, v61, 44 +; SI-NEXT: v_readlane_b32 s30, v61, 45 ; SI-NEXT: s_add_i32 s30, s30, 3 
-; SI-NEXT: v_readlane_b32 s31, v61, 43 +; SI-NEXT: v_readlane_b32 s31, v61, 44 ; SI-NEXT: s_and_b32 s30, s30, 0xff ; SI-NEXT: s_lshl_b32 s31, s31, 8 ; SI-NEXT: s_or_b32 s30, s31, s30 -; SI-NEXT: v_readlane_b32 s31, v61, 42 +; SI-NEXT: v_readlane_b32 s31, v61, 43 ; SI-NEXT: s_add_i32 s29, s34, 0x300 ; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: v_readlane_b32 s34, v61, 41 +; SI-NEXT: v_readlane_b32 s34, v61, 42 ; SI-NEXT: s_and_b32 s31, s31, 0xff ; SI-NEXT: s_lshl_b32 s34, s34, 8 ; SI-NEXT: s_or_b32 s31, s34, s31 @@ -183576,25 +183678,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, s31 ; SI-NEXT: s_addk_i32 s30, 0x300 ; SI-NEXT: s_addk_i32 vcc_hi, 0x300 -; SI-NEXT: v_readlane_b32 s34, v61, 40 +; SI-NEXT: v_readlane_b32 s34, v61, 41 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s30 ; SI-NEXT: s_add_i32 s34, s34, 3 -; SI-NEXT: v_readlane_b32 s35, v61, 39 +; SI-NEXT: v_readlane_b32 s35, v61, 40 ; SI-NEXT: s_and_b32 s34, s34, 0xff ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi ; SI-NEXT: s_lshl_b32 s35, s35, 8 -; SI-NEXT: s_addk_i32 vcc_lo, 0x300 +; SI-NEXT: s_addk_i32 s95, 0x300 ; SI-NEXT: s_or_b32 s34, s35, s34 -; SI-NEXT: v_readlane_b32 s35, v61, 38 +; SI-NEXT: v_readlane_b32 s35, v61, 39 ; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: v_readlane_b32 s36, v61, 37 +; SI-NEXT: v_readlane_b32 s36, v61, 38 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v1, s95 ; SI-NEXT: s_and_b32 s35, s35, 0xff ; SI-NEXT: s_lshl_b32 s36, s36, 8 ; SI-NEXT: s_or_b32 s35, s36, s35 @@ -183641,13 +183743,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> 
inreg %a, i ; SI-NEXT: s_addk_i32 s92, 0x300 ; SI-NEXT: s_addk_i32 s93, 0x300 ; SI-NEXT: s_addk_i32 s94, 0x300 -; SI-NEXT: s_addk_i32 s95, 0x300 +; SI-NEXT: s_addk_i32 vcc_lo, 0x300 ; SI-NEXT: s_addk_i32 s34, 0x300 ; SI-NEXT: s_addk_i32 s35, 0x300 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s35 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, vcc_lo ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v7, s95 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s94 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s93 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s92 @@ -183706,7 +183808,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: .LBB93_5: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -183761,22 +183863,22 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -183993,134 +184095,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s19, s50 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_mov_b32 s23, s48 -; SI-NEXT: s_mov_b32 s26, s37 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 -; SI-NEXT: s_mov_b32 s59, s58 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s99, s55 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_readlane_b32 s75, v61, 7 -; SI-NEXT: v_readlane_b32 s76, v61, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s39, v61, 9 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s48, v61, 11 -; SI-NEXT: v_readlane_b32 s52, v61, 13 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s50, v61, 15 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: 
v_readlane_b32 s54, v61, 17 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s70, v61, 19 -; SI-NEXT: v_readlane_b32 s49, v61, 21 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s80, v61, 23 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s84, v61, 25 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s86, v61, 27 -; SI-NEXT: v_readlane_b32 s96, v61, 29 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: v_readlane_b32 s94, v61, 31 -; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s66, v61, 33 -; SI-NEXT: v_readlane_b32 s68, v61, 34 -; SI-NEXT: v_readlane_b32 s69, v61, 35 -; SI-NEXT: v_readlane_b32 s8, v61, 36 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; 
SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v128i8_to_v64f16_scalar: ; VI: ; %bb.0: @@ -190943,24 +190917,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: 
$vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -190981,10 +190937,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr63 @@ -190992,7 +190947,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -191001,13 +190955,33 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded 
Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -191015,9 +190989,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -191041,7 +191018,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; 
GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(43) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -191100,180 +191077,195 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte 
Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 
16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 
offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v31 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -191283,9 +191275,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29 ; GFX9-NEXT: .LBB94_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v43, v50 +; GFX9-NEXT: v_mov_b32_e32 v50, v40 +; GFX9-NEXT: v_mov_b32_e32 v40, v55 +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB94_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 @@ -191298,12 +191294,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 
op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] @@ -191335,164 +191355,149 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, 
s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 
; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -191509,41 +191514,50 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa 
v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, 
off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191552,84 +191566,103 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: 
s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded 
Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -191639,16 +191672,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -191658,23 +191691,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 @@ -191682,14 +191708,18 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 
; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -191698,11 +191728,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -191711,10 +191741,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191724,11 +191754,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -191737,10 +191767,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded 
Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191750,11 +191780,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -191763,10 +191793,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191776,11 +191806,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -191789,10 +191819,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191802,53 +191832,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt 
vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -204812,36 +204823,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: v_writelane_b32 v41, s97, 33 -; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v54, off, 
s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: v_readfirstlane_b32 s39, v26 +; SI-NEXT: v_readfirstlane_b32 s56, v12 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s47, v12 -; SI-NEXT: v_writelane_b32 v42, s39, 0 -; SI-NEXT: v_readfirstlane_b32 s56, v11 -; SI-NEXT: v_writelane_b32 v42, s47, 1 -; SI-NEXT: v_readfirstlane_b32 s48, v24 -; SI-NEXT: v_writelane_b32 v42, s56, 2 -; SI-NEXT: v_readfirstlane_b32 s49, v23 -; SI-NEXT: v_writelane_b32 v42, s48, 3 -; SI-NEXT: v_readfirstlane_b32 s50, v21 -; SI-NEXT: v_writelane_b32 v42, s49, 4 -; SI-NEXT: v_readfirstlane_b32 s51, v22 -; SI-NEXT: v_writelane_b32 v42, s50, 5 -; SI-NEXT: v_writelane_b32 v42, s51, 6 -; SI-NEXT: v_readfirstlane_b32 s57, v20 -; SI-NEXT: v_readfirstlane_b32 s58, v19 -; SI-NEXT: v_readfirstlane_b32 s64, v29 -; SI-NEXT: v_readfirstlane_b32 s65, v30 -; SI-NEXT: v_readfirstlane_b32 s59, v28 -; SI-NEXT: v_readfirstlane_b32 s60, v27 -; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s57, v11 +; SI-NEXT: v_writelane_b32 v42, s56, 0 +; SI-NEXT: v_readfirstlane_b32 s50, v24 +; SI-NEXT: v_writelane_b32 v42, s57, 1 +; SI-NEXT: v_readfirstlane_b32 s51, v23 +; SI-NEXT: v_writelane_b32 v42, s50, 2 +; SI-NEXT: v_readfirstlane_b32 s52, v21 +; SI-NEXT: v_writelane_b32 v42, s51, 3 +; SI-NEXT: v_readfirstlane_b32 s53, v22 +; SI-NEXT: v_writelane_b32 v42, s52, 4 +; SI-NEXT: v_writelane_b32 v42, s53, 5 +; SI-NEXT: v_readfirstlane_b32 s58, v20 +; SI-NEXT: v_readfirstlane_b32 s59, v19 +; SI-NEXT: v_readfirstlane_b32 s67, v29 +; SI-NEXT: v_readfirstlane_b32 s71, v30 +; SI-NEXT: v_writelane_b32 v41, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s60, v28 +; SI-NEXT: v_readfirstlane_b32 s61, v27 +; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: 
v_readfirstlane_b32 s21, v9 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 14 @@ -204863,48 +204874,48 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s44, v36 -; SI-NEXT: v_readfirstlane_b32 s90, v37 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: v_readfirstlane_b32 s24, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s6, v38 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s18, v7 -; SI-NEXT: v_readfirstlane_b32 s21, v5 -; SI-NEXT: v_readfirstlane_b32 s22, v6 -; SI-NEXT: v_readfirstlane_b32 s40, v17 -; SI-NEXT: v_readfirstlane_b32 s41, v18 -; SI-NEXT: v_readfirstlane_b32 s42, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v3 -; SI-NEXT: v_readfirstlane_b32 s76, v16 -; SI-NEXT: v_readfirstlane_b32 s77, v15 -; SI-NEXT: v_readfirstlane_b32 s38, v25 +; SI-NEXT: v_readfirstlane_b32 s9, v38 +; SI-NEXT: v_readfirstlane_b32 s22, v10 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s42, v5 +; SI-NEXT: v_readfirstlane_b32 s43, v6 +; SI-NEXT: v_readfirstlane_b32 s76, v17 +; SI-NEXT: v_readfirstlane_b32 s77, v18 +; SI-NEXT: v_readfirstlane_b32 s46, v4 +; SI-NEXT: v_readfirstlane_b32 s47, v3 +; SI-NEXT: v_readfirstlane_b32 s38, v13 +; SI-NEXT: v_readfirstlane_b32 s39, v14 +; SI-NEXT: v_readfirstlane_b32 s48, v25 +; SI-NEXT: 
v_readfirstlane_b32 s49, v26 ; SI-NEXT: v_writelane_b32 v41, s99, 35 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: v_readfirstlane_b32 s95, v55 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s95, v40 +; SI-NEXT: v_readfirstlane_b32 s93, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: v_writelane_b32 v43, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: v_writelane_b32 v43, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: v_writelane_b32 v43, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: v_writelane_b32 v43, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: v_writelane_b32 v43, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -204915,37 +204926,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s91, v32 +; SI-NEXT: v_readfirstlane_b32 s8, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s8, v33 +; SI-NEXT: v_readfirstlane_b32 s11, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: v_writelane_b32 v43, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: v_writelane_b32 v43, s4, 26 ; SI-NEXT: 
s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: v_writelane_b32 v43, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: v_writelane_b32 v43, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s89, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s89, v38 +; SI-NEXT: v_readfirstlane_b32 s70, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s78, v39 +; SI-NEXT: v_readfirstlane_b32 s7, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s7, v48 +; SI-NEXT: v_readfirstlane_b32 s10, v48 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s82, v49 ; SI-NEXT: s_waitcnt vmcnt(7) @@ -204959,39 +204969,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: v_writelane_b32 v43, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s70, v33 +; SI-NEXT: v_readfirstlane_b32 s69, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: v_writelane_b32 v43, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_readfirstlane_b32 s92, v32 
; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s18, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s9, v35 +; SI-NEXT: v_readfirstlane_b32 s12, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s13, v36 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_readfirstlane_b32 s10, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 34 +; SI-NEXT: v_writelane_b32 v43, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 35 +; SI-NEXT: v_readfirstlane_b32 s90, v38 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 36 +; SI-NEXT: v_readfirstlane_b32 s91, v39 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v48 +; SI-NEXT: v_readfirstlane_b32 s68, v48 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s30, v49 +; SI-NEXT: v_readfirstlane_b32 s37, v49 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s16, v50 +; SI-NEXT: v_readfirstlane_b32 s84, v50 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s36, v51 +; SI-NEXT: v_readfirstlane_b32 s6, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -205007,50 +205014,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 37 +; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: 
v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 38 +; SI-NEXT: v_writelane_b32 v43, s4, 34 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 39 +; SI-NEXT: v_writelane_b32 v43, s4, 35 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 40 -; SI-NEXT: v_writelane_b32 v43, s44, 41 -; SI-NEXT: v_writelane_b32 v43, s6, 42 -; SI-NEXT: v_writelane_b32 v43, s7, 43 -; SI-NEXT: v_writelane_b32 v43, s8, 44 -; SI-NEXT: v_writelane_b32 v43, s9, 45 -; SI-NEXT: v_writelane_b32 v43, s10, 46 -; SI-NEXT: v_writelane_b32 v43, s11, 47 -; SI-NEXT: v_writelane_b32 v43, s12, 48 -; SI-NEXT: v_writelane_b32 v43, s13, 49 -; SI-NEXT: v_writelane_b32 v43, s14, 50 -; SI-NEXT: v_writelane_b32 v43, s15, 51 -; SI-NEXT: v_writelane_b32 v43, s18, 52 -; SI-NEXT: v_writelane_b32 v43, s21, 53 -; SI-NEXT: v_writelane_b32 v43, s22, 54 -; SI-NEXT: v_writelane_b32 v43, s40, 55 -; SI-NEXT: v_writelane_b32 v43, s41, 56 -; SI-NEXT: v_writelane_b32 v43, s42, 57 -; SI-NEXT: v_writelane_b32 v43, s43, 58 -; SI-NEXT: v_writelane_b32 v43, s76, 59 -; SI-NEXT: v_writelane_b32 v43, s77, 60 +; SI-NEXT: v_writelane_b32 v43, s4, 36 +; SI-NEXT: v_writelane_b32 v43, s6, 37 +; SI-NEXT: v_writelane_b32 v43, s7, 38 +; SI-NEXT: v_writelane_b32 v43, s8, 39 +; SI-NEXT: v_writelane_b32 v43, s18, 40 +; SI-NEXT: v_writelane_b32 v43, s9, 41 +; SI-NEXT: v_writelane_b32 v43, s10, 42 +; SI-NEXT: v_writelane_b32 v43, s11, 43 +; SI-NEXT: v_writelane_b32 v43, s12, 44 +; SI-NEXT: v_writelane_b32 v43, s13, 45 +; SI-NEXT: v_writelane_b32 v43, s14, 46 +; SI-NEXT: v_writelane_b32 v43, s15, 47 +; SI-NEXT: v_writelane_b32 v43, s21, 48 +; SI-NEXT: v_writelane_b32 v43, s22, 49 +; SI-NEXT: v_writelane_b32 v43, s40, 50 +; SI-NEXT: v_writelane_b32 v43, s41, 51 +; SI-NEXT: v_writelane_b32 v43, s42, 52 +; SI-NEXT: v_writelane_b32 v43, s43, 53 +; SI-NEXT: v_writelane_b32 v43, s76, 54 +; SI-NEXT: v_writelane_b32 v43, s77, 55 +; SI-NEXT: v_writelane_b32 v43, s46, 56 +; SI-NEXT: 
v_writelane_b32 v43, s47, 57 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s17, v33 +; SI-NEXT: v_readfirstlane_b32 s16, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: v_readfirstlane_b32 s35, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s23, v35 -; SI-NEXT: v_readfirstlane_b32 s25, v31 -; SI-NEXT: v_readfirstlane_b32 s28, v32 +; SI-NEXT: v_readfirstlane_b32 s19, v35 +; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s29, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s26, v36 +; SI-NEXT: v_readfirstlane_b32 s87, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s88, v37 +; SI-NEXT: v_readfirstlane_b32 s79, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s79, v38 +; SI-NEXT: v_readfirstlane_b32 s27, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s75, v39 +; SI-NEXT: v_readfirstlane_b32 s25, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 @@ -205063,39 +205071,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v49 +; SI-NEXT: v_readfirstlane_b32 s88, v49 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s85, v50 +; SI-NEXT: v_readfirstlane_b32 s86, v50 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s66, v51 +; SI-NEXT: v_readfirstlane_b32 s34, v51 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 
vcc_lo, v13 -; SI-NEXT: v_readfirstlane_b32 vcc_hi, v14 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 61 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 62 -; SI-NEXT: v_writelane_b32 v43, s38, 63 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v16 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v15 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 58 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 59 +; SI-NEXT: v_writelane_b32 v43, s38, 60 +; SI-NEXT: v_writelane_b32 v43, s39, 61 +; SI-NEXT: v_writelane_b32 v43, s48, 62 +; SI-NEXT: v_writelane_b32 v43, s49, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s20, v31 +; SI-NEXT: v_readfirstlane_b32 s17, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s19, v32 +; SI-NEXT: v_readfirstlane_b32 s20, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s27, v33 +; SI-NEXT: v_readfirstlane_b32 s94, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s94, v34 +; SI-NEXT: v_readfirstlane_b32 s26, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s72, v35 +; SI-NEXT: v_readfirstlane_b32 s73, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s73, v36 +; SI-NEXT: v_readfirstlane_b32 s74, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s67, v37 +; SI-NEXT: v_readfirstlane_b32 s80, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s71, v38 +; SI-NEXT: v_readfirstlane_b32 s81, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s97, v39 +; SI-NEXT: v_readfirstlane_b32 s36, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -205105,141 +205116,141 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: 
v_readfirstlane_b32 s35, v48 +; SI-NEXT: v_readfirstlane_b32 s31, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: v_readfirstlane_b32 s23, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s87, v50 +; SI-NEXT: v_readfirstlane_b32 s83, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s63, v51 +; SI-NEXT: v_readfirstlane_b32 s72, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s75, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s81, v32 +; SI-NEXT: v_readfirstlane_b32 s78, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s80, v33 +; SI-NEXT: v_readfirstlane_b32 s97, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s86, v34 +; SI-NEXT: v_readfirstlane_b32 s98, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s34, v35 +; SI-NEXT: v_readfirstlane_b32 s30, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s84, v36 +; SI-NEXT: v_readfirstlane_b32 s85, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s31, v37 +; SI-NEXT: v_readfirstlane_b32 s66, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s61, v38 +; SI-NEXT: v_readfirstlane_b32 s62, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s62, v39 +; SI-NEXT: v_readfirstlane_b32 s63, v39 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s53, v48 +; SI-NEXT: v_readfirstlane_b32 s55, v48 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s52, v49 -; SI-NEXT: v_writelane_b32 v42, s52, 7 -; SI-NEXT: v_writelane_b32 v42, s53, 8 -; SI-NEXT: 
v_writelane_b32 v42, s57, 9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s54, v50 -; SI-NEXT: v_writelane_b32 v42, s58, 10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s55, v51 -; SI-NEXT: v_writelane_b32 v42, s54, 11 -; SI-NEXT: v_writelane_b32 v42, s55, 12 -; SI-NEXT: v_writelane_b32 v42, s64, 13 -; SI-NEXT: v_writelane_b32 v42, s65, 14 -; SI-NEXT: v_writelane_b32 v42, s67, 15 -; SI-NEXT: v_writelane_b32 v42, s71, 16 -; SI-NEXT: v_writelane_b32 v42, s80, 17 -; SI-NEXT: v_writelane_b32 v42, s81, 18 -; SI-NEXT: v_writelane_b32 v42, s59, 19 -; SI-NEXT: v_writelane_b32 v42, s60, 20 -; SI-NEXT: v_writelane_b32 v42, s86, 21 -; SI-NEXT: v_writelane_b32 v42, s97, 22 +; SI-NEXT: v_readfirstlane_b32 s54, v49 +; SI-NEXT: v_writelane_b32 v42, s54, 6 +; SI-NEXT: v_writelane_b32 v42, s55, 7 +; SI-NEXT: v_writelane_b32 v42, s58, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s64, v50 +; SI-NEXT: v_writelane_b32 v42, s59, 9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s65, v51 +; SI-NEXT: v_writelane_b32 v42, s64, 10 +; SI-NEXT: v_writelane_b32 v42, s65, 11 +; SI-NEXT: v_writelane_b32 v42, s67, 12 +; SI-NEXT: v_writelane_b32 v42, s71, 13 +; SI-NEXT: v_writelane_b32 v42, s80, 14 +; SI-NEXT: v_writelane_b32 v42, s81, 15 +; SI-NEXT: v_writelane_b32 v42, s97, 16 +; SI-NEXT: v_writelane_b32 v42, s78, 17 +; SI-NEXT: v_writelane_b32 v42, s60, 18 +; SI-NEXT: v_writelane_b32 v42, s61, 19 +; SI-NEXT: v_writelane_b32 v42, s98, 20 +; SI-NEXT: v_writelane_b32 v42, s36, 21 +; SI-NEXT: v_writelane_b32 v42, s30, 22 ; SI-NEXT: v_writelane_b32 v42, s34, 23 -; SI-NEXT: v_writelane_b32 v42, s66, 24 -; SI-NEXT: v_writelane_b32 v42, s85, 25 -; SI-NEXT: v_writelane_b32 v42, s31, 26 -; SI-NEXT: v_writelane_b32 v42, s84, 27 +; SI-NEXT: v_writelane_b32 v42, s86, 24 +; SI-NEXT: v_writelane_b32 v42, s66, 25 +; SI-NEXT: v_writelane_b32 v42, s85, 26 +; SI-NEXT: v_writelane_b32 v42, s31, 27 ; SI-NEXT: v_writelane_b32 v42, s35, 28 -; 
SI-NEXT: v_writelane_b32 v42, s98, 29 +; SI-NEXT: v_writelane_b32 v42, s16, 29 ; SI-NEXT: v_writelane_b32 v42, s17, 30 -; SI-NEXT: v_writelane_b32 v42, s20, 31 -; SI-NEXT: v_writelane_b32 v42, s61, 32 -; SI-NEXT: v_writelane_b32 v42, s19, 33 -; SI-NEXT: v_writelane_b32 v42, s62, 34 +; SI-NEXT: v_writelane_b32 v42, s62, 31 +; SI-NEXT: v_writelane_b32 v42, s20, 32 +; SI-NEXT: v_writelane_b32 v42, s63, 33 +; SI-NEXT: v_writelane_b32 v42, s19, 34 ; SI-NEXT: v_writelane_b32 v42, s23, 35 ; SI-NEXT: v_writelane_b32 v42, s83, 36 ; SI-NEXT: v_writelane_b32 v42, s87, 37 ; SI-NEXT: v_writelane_b32 v42, s26, 38 ; SI-NEXT: v_writelane_b32 v42, s94, 39 -; SI-NEXT: v_writelane_b32 v42, s27, 40 -; SI-NEXT: v_writelane_b32 v42, s63, 41 +; SI-NEXT: v_writelane_b32 v42, s72, 40 +; SI-NEXT: v_writelane_b32 v42, s27, 41 ; SI-NEXT: v_writelane_b32 v42, s79, 42 -; SI-NEXT: v_writelane_b32 v42, s88, 43 -; SI-NEXT: v_writelane_b32 v42, s72, 44 -; SI-NEXT: v_writelane_b32 v42, s73, 45 -; SI-NEXT: v_writelane_b32 v42, s74, 46 -; SI-NEXT: v_writelane_b32 v42, s75, 47 -; SI-NEXT: v_writelane_b32 v42, s24, 48 -; SI-NEXT: v_writelane_b32 v42, s25, 49 -; SI-NEXT: v_writelane_b32 v42, s28, 50 +; SI-NEXT: v_writelane_b32 v42, s73, 43 +; SI-NEXT: v_writelane_b32 v42, s74, 44 +; SI-NEXT: v_writelane_b32 v42, s75, 45 +; SI-NEXT: v_writelane_b32 v42, s25, 46 +; SI-NEXT: v_writelane_b32 v42, s88, 47 +; SI-NEXT: v_writelane_b32 v42, s28, 48 +; SI-NEXT: v_writelane_b32 v42, s29, 49 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_readlane_b32 s4, v43, 13 ; SI-NEXT: v_readlane_b32 s5, v43, 12 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s29, s4, s5 +; SI-NEXT: s_or_b32 s44, s4, s5 ; SI-NEXT: v_readlane_b32 s4, v43, 5 ; SI-NEXT: v_readlane_b32 s5, v43, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s45, s4, s5 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: 
s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s46, s4, s5 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: s_or_b32 s47, s4, s5 -; SI-NEXT: s_and_b32 s4, s58, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s56, s4, s5 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: s_and_b32 s4, s61, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 ; SI-NEXT: s_or_b32 s57, s4, s5 -; SI-NEXT: s_and_b32 s4, s62, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s58, s4, s5 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s63, 8 -; SI-NEXT: s_or_b32 s59, s4, s5 -; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_and_b32 s4, s75, 0xff ; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s73, 8 ; SI-NEXT: s_or_b32 s60, s4, s5 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: s_or_b32 s61, s4, s5 -; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_and_b32 s4, s88, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 ; SI-NEXT: s_or_b32 s62, s4, s5 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s63, s4, s5 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 ; SI-NEXT: s_or_b32 s72, s4, s5 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: s_or_b32 s73, s4, s5 -; SI-NEXT: s_and_b32 
s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: s_or_b32 s74, s4, s5 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s24, 8 ; SI-NEXT: s_or_b32 s75, s4, s5 ; SI-NEXT: v_readlane_b32 s4, v43, 9 ; SI-NEXT: v_readlane_b32 s5, v43, 8 @@ -205258,7 +205269,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: v_writelane_b32 v42, s7, 52 ; SI-NEXT: s_or_b32 s4, s6, s4 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: v_readlane_b32 s6, v43, 1 @@ -205266,345 +205277,340 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s7, s6, s7 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_and_b32 s6, s14, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s12, 24 -; SI-NEXT: s_or_b32 s37, s8, s6 +; SI-NEXT: s_lshl_b32 s8, s15, 24 +; SI-NEXT: s_or_b32 s9, s8, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: v_readlane_b32 s8, v43, 2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_writelane_b32 v42, s9, 53 ; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_and_b32 s8, s18, 0xff -; SI-NEXT: s_lshl_b32 s9, s15, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s8, s41, 0xff +; SI-NEXT: s_lshl_b32 s9, s40, 8 ; SI-NEXT: s_or_b32 s9, s8, s9 -; SI-NEXT: s_and_b32 s8, s13, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s10, s14, 24 -; SI-NEXT: s_or_b32 s68, s10, s8 ; SI-NEXT: s_and_b32 s8, s21, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_lshl_b32 s10, s22, 24 +; SI-NEXT: s_or_b32 s11, s10, s8 +; 
SI-NEXT: s_and_b32 s8, s42, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s10, s43, 24 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: v_writelane_b32 v42, s11, 54 ; SI-NEXT: s_or_b32 s8, s10, s8 -; SI-NEXT: s_and_b32 s10, s77, 0xff -; SI-NEXT: s_lshl_b32 s11, s76, 8 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s10, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s11, vcc_lo, 8 ; SI-NEXT: s_or_b32 s11, s10, s11 -; SI-NEXT: s_and_b32 s10, s40, 0xff +; SI-NEXT: s_and_b32 s10, s76, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, s41, 24 -; SI-NEXT: s_or_b32 s99, s12, s10 -; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s12, s77, 24 +; SI-NEXT: s_or_b32 s13, s12, s10 +; SI-NEXT: s_and_b32 s10, s38, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, vcc_hi, 24 +; SI-NEXT: s_lshl_b32 s12, s39, 24 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_writelane_b32 v42, s13, 55 ; SI-NEXT: s_or_b32 s10, s12, s10 -; SI-NEXT: s_and_b32 s12, s49, 0xff -; SI-NEXT: s_lshl_b32 s13, s48, 8 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s12, s51, 0xff +; SI-NEXT: s_lshl_b32 s13, s50, 8 ; SI-NEXT: s_or_b32 s13, s12, s13 -; SI-NEXT: s_and_b32 s12, s38, 0xff +; SI-NEXT: s_and_b32 s12, s48, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s14, s39, 24 -; SI-NEXT: s_or_b32 s92, s14, s12 -; SI-NEXT: s_and_b32 s12, s50, 0xff +; SI-NEXT: s_lshl_b32 s14, s49, 24 +; SI-NEXT: s_or_b32 s99, s14, s12 +; SI-NEXT: s_and_b32 s12, s52, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s14, s51, 24 +; SI-NEXT: s_lshl_b32 s14, s53, 24 ; SI-NEXT: s_or_b32 s12, s14, s12 -; SI-NEXT: s_and_b32 s14, s55, 0xff -; SI-NEXT: s_lshl_b32 s15, s54, 8 +; SI-NEXT: s_and_b32 s14, s65, 0xff +; SI-NEXT: s_lshl_b32 s15, s64, 8 ; SI-NEXT: s_or_b32 s15, s14, s15 -; SI-NEXT: s_and_b32 s14, s52, 0xff +; SI-NEXT: s_and_b32 s14, s54, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s24, s53, 24 -; SI-NEXT: 
s_mov_b32 s28, s90 -; SI-NEXT: s_or_b32 s90, s24, s14 -; SI-NEXT: s_and_b32 s14, s64, 0xff +; SI-NEXT: v_writelane_b32 v42, s24, 56 +; SI-NEXT: s_lshl_b32 s24, s55, 24 +; SI-NEXT: s_or_b32 s24, s24, s14 +; SI-NEXT: s_and_b32 s14, s67, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s25, s65, 24 +; SI-NEXT: s_lshl_b32 s25, s71, 24 ; SI-NEXT: s_or_b32 s14, s25, s14 -; SI-NEXT: s_and_b32 s25, s34, 0xff -; SI-NEXT: s_lshl_b32 s40, s86, 8 +; SI-NEXT: s_and_b32 s25, s30, 0xff +; SI-NEXT: s_lshl_b32 s40, s98, 8 ; SI-NEXT: s_or_b32 s41, s25, s40 -; SI-NEXT: s_and_b32 s25, s80, 0xff +; SI-NEXT: s_and_b32 s25, s97, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_lshl_b32 s40, s81, 24 -; SI-NEXT: s_or_b32 s18, s40, s25 -; SI-NEXT: s_and_b32 s40, s31, 0xff +; SI-NEXT: s_lshl_b32 s40, s78, 24 +; SI-NEXT: s_or_b32 s25, s40, s25 +; SI-NEXT: s_and_b32 s40, s66, 0xff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_lshl_b32 s42, s84, 24 +; SI-NEXT: s_lshl_b32 s42, s85, 24 ; SI-NEXT: s_or_b32 s40, s42, s40 -; SI-NEXT: s_and_b32 s42, s35, 0xff -; SI-NEXT: s_lshl_b32 s43, s97, 8 +; SI-NEXT: s_and_b32 s42, s31, 0xff +; SI-NEXT: s_lshl_b32 s43, s36, 8 ; SI-NEXT: s_or_b32 s43, s42, s43 -; SI-NEXT: s_and_b32 s42, s71, 0xff +; SI-NEXT: s_and_b32 s42, s81, 0xff ; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_lshl_b32 s76, s67, 24 -; SI-NEXT: s_or_b32 s35, s76, s42 -; SI-NEXT: s_and_b32 s42, s87, 0xff +; SI-NEXT: s_lshl_b32 s76, s80, 24 +; SI-NEXT: s_or_b32 s21, s76, s42 +; SI-NEXT: s_and_b32 s42, s83, 0xff ; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 24 +; SI-NEXT: s_lshl_b32 s76, s23, 24 ; SI-NEXT: s_or_b32 s42, s76, s42 -; SI-NEXT: s_and_b32 s76, s19, 0xff -; SI-NEXT: s_lshl_b32 s77, s20, 8 +; SI-NEXT: s_and_b32 s76, s20, 0xff +; SI-NEXT: s_lshl_b32 s77, s17, 8 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s66, 0xff -; SI-NEXT: v_writelane_b32 v42, s78, 52 +; SI-NEXT: s_and_b32 s77, s34, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; 
SI-NEXT: s_lshl_b32 s78, s85, 24 -; SI-NEXT: s_or_b32 s19, s78, s77 -; SI-NEXT: s_and_b32 s77, s94, 0xff +; SI-NEXT: s_lshl_b32 s78, s86, 24 +; SI-NEXT: s_or_b32 s17, s78, s77 +; SI-NEXT: s_and_b32 s77, s26, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s27, 24 +; SI-NEXT: s_lshl_b32 s78, s94, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff ; SI-NEXT: s_or_b32 vcc_lo, s78, s77 -; SI-NEXT: s_or_b32 vcc_hi, s76, s19 -; SI-NEXT: s_and_b32 s76, s26, 0xff -; SI-NEXT: s_lshl_b32 s77, s23, 8 +; SI-NEXT: s_or_b32 vcc_hi, s76, s17 +; SI-NEXT: s_and_b32 s76, s87, 0xff +; SI-NEXT: s_lshl_b32 s77, s19, 8 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s98, 0xff +; SI-NEXT: s_and_b32 s77, s35, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_or_b32 s71, s78, s77 -; SI-NEXT: s_and_b32 s77, s79, 0xff +; SI-NEXT: s_and_b32 s77, s27, 0xff ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 40 -; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 36 +; SI-NEXT: s_and_b32 s43, s43, 0xffff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s88, 24 +; SI-NEXT: s_lshl_b32 s78, s79, 24 ; SI-NEXT: s_or_b32 s39, s76, s71 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 39 -; SI-NEXT: s_or_b32 s41, s41, s18 -; SI-NEXT: s_mov_b32 s31, s18 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 35 +; SI-NEXT: s_or_b32 s43, s43, s21 +; SI-NEXT: s_mov_b32 s23, s21 ; SI-NEXT: s_or_b32 s38, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 38 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s21, v43, 34 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 37 +; SI-NEXT: s_and_b32 s77, s21, 0xff +; SI-NEXT: v_readlane_b32 s21, v43, 33 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, 
s18, 24 +; SI-NEXT: s_lshl_b32 s78, s21, 24 ; SI-NEXT: s_or_b32 s80, s78, s77 -; SI-NEXT: s_and_b32 s77, s95, 0xff -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 36 +; SI-NEXT: s_and_b32 s77, s93, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s93, 24 -; SI-NEXT: s_or_b32 s49, s76, s80 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 35 +; SI-NEXT: s_lshl_b32 s78, s95, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff ; SI-NEXT: s_or_b32 s48, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 34 +; SI-NEXT: s_or_b32 s49, s76, s80 +; SI-NEXT: s_and_b32 s76, s91, 0xff +; SI-NEXT: s_lshl_b32 s77, s90, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 32 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 33 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 31 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_or_b32 s81, s78, s77 -; SI-NEXT: s_and_b32 s77, s30, 0xff +; SI-NEXT: s_and_b32 s77, s37, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s69, 24 +; SI-NEXT: s_lshl_b32 s78, s68, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 31 ; SI-NEXT: s_or_b32 s50, s78, s77 ; SI-NEXT: s_or_b32 s51, s76, s81 -; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_and_b32 s76, s92, 0xff ; SI-NEXT: s_lshl_b32 s77, s96, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 30 +; SI-NEXT: v_readlane_b32 s16, v43, 30 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff +; SI-NEXT: s_and_b32 s77, s16, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 ; SI-NEXT: s_lshl_b32 s78, s82, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 53 -; SI-NEXT: v_readlane_b32 s18, v43, 32 -; SI-NEXT: v_writelane_b32 v42, s82, 54 +; SI-NEXT: v_writelane_b32 v42, s96, 57 +; SI-NEXT: v_writelane_b32 v42, s82, 58 ; SI-NEXT: s_or_b32 s82, 
s78, s77 ; SI-NEXT: s_and_b32 s77, s18, 0xff ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 28 +; SI-NEXT: v_readlane_b32 s16, v43, 29 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s70, 24 +; SI-NEXT: s_lshl_b32 s78, s69, 24 ; SI-NEXT: s_or_b32 s53, s76, s82 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 27 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 28 ; SI-NEXT: s_or_b32 s52, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 26 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 27 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 25 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 26 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: v_writelane_b32 v42, s16, 55 -; SI-NEXT: s_or_b32 s16, s78, s77 -; SI-NEXT: s_and_b32 s77, s89, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 29 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_writelane_b32 v42, s84, 59 +; SI-NEXT: s_or_b32 s84, s78, s77 +; SI-NEXT: s_and_b32 s77, s70, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_lshl_b32 s78, s89, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 22 -; SI-NEXT: v_readlane_b32 s18, v43, 21 +; SI-NEXT: v_readlane_b32 s16, v43, 23 +; SI-NEXT: v_readlane_b32 s18, v43, 22 ; SI-NEXT: s_or_b32 s54, s78, s77 -; SI-NEXT: s_or_b32 s55, s76, s16 -; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_or_b32 s55, s76, s84 +; SI-NEXT: s_and_b32 s76, s16, 0xff ; SI-NEXT: s_lshl_b32 s77, s18, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 20 +; SI-NEXT: v_readlane_b32 s16, v43, 21 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 19 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 20 ; SI-NEXT: 
s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: v_readlane_b32 s17, v43, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 25 ; SI-NEXT: s_or_b32 s83, s78, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 23 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 24 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 17 +; SI-NEXT: v_readlane_b32 s16, v43, 17 ; SI-NEXT: v_readlane_b32 s18, v43, 16 ; SI-NEXT: s_or_b32 s64, s78, s77 ; SI-NEXT: s_or_b32 s65, s76, s83 -; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_and_b32 s76, s16, 0xff ; SI-NEXT: s_lshl_b32 s77, s18, 8 ; SI-NEXT: v_readlane_b32 s18, v43, 15 ; SI-NEXT: s_or_b32 s76, s76, s77 ; SI-NEXT: s_and_b32 s77, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v43, 14 -; SI-NEXT: v_writelane_b32 v42, s89, 56 ; SI-NEXT: s_lshl_b32 s77, s77, 16 ; SI-NEXT: s_lshl_b32 s78, s18, 24 -; SI-NEXT: v_writelane_b32 v42, s70, 57 +; SI-NEXT: v_readlane_b32 s16, v43, 19 ; SI-NEXT: s_or_b32 s85, s78, s77 -; SI-NEXT: s_and_b32 s77, s44, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 18 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_and_b32 s43, s43, 0xffff -; SI-NEXT: v_writelane_b32 v42, s69, 58 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 18 +; SI-NEXT: v_writelane_b32 v42, s89, 60 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: s_and_b32 s44, s29, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s37 -; SI-NEXT: s_or_b32 s9, s9, s68 -; SI-NEXT: s_or_b32 s11, s11, s99 -; SI-NEXT: s_or_b32 s13, s13, s92 -; SI-NEXT: s_or_b32 s15, s15, s90 
-; SI-NEXT: s_or_b32 s43, s43, s35 -; SI-NEXT: v_writelane_b32 v42, s30, 59 -; SI-NEXT: s_mov_b32 s23, s91 -; SI-NEXT: s_mov_b32 s91, s36 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_writelane_b32 v42, s70, 61 ; SI-NEXT: s_or_b32 s66, s78, s77 ; SI-NEXT: s_or_b32 s67, s76, s85 -; SI-NEXT: s_and_b32 s45, s45, 0xffff -; SI-NEXT: s_and_b32 s46, s46, 0xffff -; SI-NEXT: s_and_b32 s47, s47, 0xffff -; SI-NEXT: s_and_b32 s56, s56, 0xffff -; SI-NEXT: s_and_b32 s57, s57, 0xffff -; SI-NEXT: s_and_b32 s30, s58, 0xffff -; SI-NEXT: s_and_b32 s34, s59, 0xffff -; SI-NEXT: s_and_b32 s36, s60, 0xffff -; SI-NEXT: s_and_b32 s97, s61, 0xffff -; SI-NEXT: s_and_b32 s86, s62, 0xffff -; SI-NEXT: s_and_b32 s98, s63, 0xffff -; SI-NEXT: s_and_b32 s17, s72, 0xffff -; SI-NEXT: s_and_b32 s87, s73, 0xffff ; SI-NEXT: s_and_b32 s96, s74, 0xffff ; SI-NEXT: s_and_b32 s22, s75, 0xffff ; SI-NEXT: s_or_b32 s74, s44, s4 ; SI-NEXT: s_mov_b32 s75, s5 ; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 -; SI-NEXT: s_mov_b32 s70, s93 -; SI-NEXT: s_mov_b32 s69, s95 -; SI-NEXT: s_mov_b32 s93, s28 -; SI-NEXT: s_or_b32 s72, s45, s6 -; SI-NEXT: s_mov_b32 s73, s7 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[4:5], vcc, 16 +; SI-NEXT: v_writelane_b32 v42, s4, 50 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_and_b32 s87, s73, 0xffff +; SI-NEXT: v_writelane_b32 v42, s5, 51 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_and_b32 s86, s62, 0xffff +; SI-NEXT: s_and_b32 s98, s63, 0xffff +; SI-NEXT: s_and_b32 s21, s72, 0xffff ; SI-NEXT: s_or_b32 s62, s46, s8 ; SI-NEXT: s_mov_b32 s63, s9 ; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_or_b32 s8, s87, s54 +; SI-NEXT: s_mov_b32 s9, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: v_readlane_b32 s16, v42, 52 +; SI-NEXT: s_or_b32 s13, s13, s99 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: 
s_and_b32 s56, s56, 0xffff +; SI-NEXT: s_and_b32 s36, s60, 0xffff +; SI-NEXT: s_and_b32 s97, s61, 0xffff ; SI-NEXT: s_or_b32 s60, s47, s10 ; SI-NEXT: s_mov_b32 s61, s11 -; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 16 +; SI-NEXT: s_or_b32 s10, s21, s52 +; SI-NEXT: s_mov_b32 s11, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_lshr_b32 s55, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v42, 53 +; SI-NEXT: s_or_b32 s15, s15, s24 +; SI-NEXT: s_or_b32 s41, s41, s25 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_and_b32 s57, s57, 0xffff +; SI-NEXT: s_and_b32 s30, s58, 0xffff +; SI-NEXT: s_and_b32 s34, s59, 0xffff ; SI-NEXT: s_or_b32 s58, s56, s12 ; SI-NEXT: s_mov_b32 s59, s13 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 +; SI-NEXT: s_or_b32 s12, s98, s50 +; SI-NEXT: s_mov_b32 s13, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 +; SI-NEXT: s_lshr_b32 s53, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v42, 54 +; SI-NEXT: s_mov_b32 s70, s69 +; SI-NEXT: s_mov_b32 s69, s68 +; SI-NEXT: s_mov_b32 s68, s37 +; SI-NEXT: s_mov_b32 s37, s95 +; SI-NEXT: s_or_b32 s72, s45, s6 +; SI-NEXT: s_mov_b32 s73, s7 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 ; SI-NEXT: s_or_b32 s56, s57, s14 ; SI-NEXT: s_mov_b32 s57, s15 -; SI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 ; SI-NEXT: s_or_b32 s46, s30, s40 ; SI-NEXT: s_mov_b32 s47, s41 -; SI-NEXT: s_or_b32 s44, s34, s42 -; SI-NEXT: s_mov_b32 s34, s4 -; SI-NEXT: s_mov_b32 s45, s43 -; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 -; SI-NEXT: s_or_b32 s42, s36, vcc_lo -; SI-NEXT: s_mov_b32 s43, vcc_hi -; SI-NEXT: s_lshr_b64 vcc, vcc, 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[40:41], 16 ; SI-NEXT: s_or_b32 s40, s97, s38 ; SI-NEXT: s_mov_b32 s41, s39 ; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 16 ; SI-NEXT: s_or_b32 s14, s86, s48 ; SI-NEXT: s_mov_b32 s15, s49 ; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 -; SI-NEXT: 
s_or_b32 s12, s98, s50 -; SI-NEXT: s_mov_b32 s13, s51 -; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 -; SI-NEXT: s_or_b32 s10, s17, s52 -; SI-NEXT: s_mov_b32 s11, s53 -; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 -; SI-NEXT: s_or_b32 s8, s87, s54 -; SI-NEXT: s_mov_b32 s9, s55 -; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 ; SI-NEXT: s_or_b32 s6, s96, s64 ; SI-NEXT: s_mov_b32 s7, s65 ; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 ; SI-NEXT: s_or_b32 s4, s22, s66 ; SI-NEXT: s_mov_b32 s5, s67 ; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 -; SI-NEXT: v_readlane_b32 s17, v42, 51 -; SI-NEXT: s_lshr_b32 s55, s17, 16 -; SI-NEXT: s_lshr_b32 s53, s37, 16 -; SI-NEXT: s_lshr_b32 s51, s68, 16 -; SI-NEXT: s_lshr_b32 s49, s99, 16 -; SI-NEXT: s_lshr_b32 s86, s92, 16 -; SI-NEXT: s_lshr_b32 s39, s90, 16 -; SI-NEXT: s_lshr_b32 s18, s31, 16 -; SI-NEXT: s_lshr_b32 s22, s35, 16 -; SI-NEXT: s_lshr_b32 s97, s19, 16 +; SI-NEXT: s_lshr_b32 s51, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v42, 55 +; SI-NEXT: s_or_b32 s44, s34, s42 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_or_b32 s42, s36, vcc_lo +; SI-NEXT: s_mov_b32 s43, vcc_hi +; SI-NEXT: s_lshr_b32 s49, s16, 16 +; SI-NEXT: s_lshr_b32 s86, s99, 16 +; SI-NEXT: s_lshr_b32 s39, s24, 16 +; SI-NEXT: s_lshr_b32 s18, s25, 16 +; SI-NEXT: s_lshr_b32 s22, s23, 16 +; SI-NEXT: s_lshr_b32 s97, s17, 16 ; SI-NEXT: s_lshr_b32 s65, s71, 16 -; SI-NEXT: s_lshr_b32 s19, s80, 16 -; SI-NEXT: s_lshr_b32 s71, s81, 16 +; SI-NEXT: s_lshr_b32 s71, s80, 16 +; SI-NEXT: s_lshr_b32 s21, s81, 16 ; SI-NEXT: s_lshr_b32 s67, s82, 16 -; SI-NEXT: v_readlane_b32 s82, v42, 54 -; SI-NEXT: v_readlane_b32 s96, v42, 53 -; SI-NEXT: s_lshr_b32 s80, s16, 16 -; SI-NEXT: v_readlane_b32 s16, v42, 55 +; SI-NEXT: v_readlane_b32 s82, v42, 58 +; SI-NEXT: v_readlane_b32 s96, v42, 57 +; SI-NEXT: s_lshr_b32 s80, s84, 16 +; SI-NEXT: v_readlane_b32 s84, v42, 59 ; SI-NEXT: s_lshr_b32 s81, s83, 16 -; SI-NEXT: s_mov_b32 s90, s93 -; SI-NEXT: v_readlane_b32 s78, 
v42, 52 -; SI-NEXT: s_mov_b32 s95, s69 -; SI-NEXT: s_mov_b32 s93, s70 -; SI-NEXT: v_readlane_b32 s30, v42, 59 -; SI-NEXT: v_readlane_b32 s69, v42, 58 -; SI-NEXT: v_readlane_b32 s70, v42, 57 -; SI-NEXT: v_readlane_b32 s89, v42, 56 +; SI-NEXT: s_mov_b32 s95, s37 +; SI-NEXT: s_mov_b32 s37, s68 +; SI-NEXT: s_mov_b32 s68, s69 +; SI-NEXT: s_mov_b32 s69, s70 +; SI-NEXT: v_readlane_b32 s70, v42, 61 +; SI-NEXT: v_readlane_b32 s89, v42, 60 ; SI-NEXT: s_lshr_b32 s77, s85, 16 -; SI-NEXT: s_mov_b32 s84, vcc_lo -; SI-NEXT: s_mov_b32 s36, s91 -; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: v_readlane_b32 s24, v42, 56 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: v_readlane_b32 s4, v43, 42 +; SI-NEXT: v_readlane_b32 s4, v43, 41 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s6, v43, 41 +; SI-NEXT: v_readlane_b32 s6, v43, 19 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_lshl_b32 s5, s24, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readlane_b32 s5, v43, 18 @@ -205631,14 +205637,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v43, 44 +; SI-NEXT: v_readlane_b32 s6, v43, 43 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s8, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 39 +; SI-NEXT: v_readlane_b32 s8, v43, 25 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s7, s91, 8 +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 @@ -205646,15 +205653,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 
s6, s6, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_readlane_b32 s8, v43, 21 -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: v_readlane_b32 s8, v43, 22 +; SI-NEXT: v_readlane_b32 s9, v43, 21 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v43, 19 +; SI-NEXT: v_readlane_b32 s8, v43, 20 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 @@ -205662,29 +205669,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v43, 43 +; SI-NEXT: v_readlane_b32 s8, v43, 42 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: v_readlane_b32 s9, v43, 38 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s9, s78, 8 -; SI-NEXT: s_add_i32 s10, s89, 3 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_add_i32 s10, s70, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s9, s89, 24 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_addk_i32 s8, 0x300 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: v_readlane_b32 s9, v43, 29 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: v_readlane_b32 s10, v43, 27 -; SI-NEXT: v_readlane_b32 s11, v43, 26 +; SI-NEXT: v_readlane_b32 s10, v43, 28 +; SI-NEXT: v_readlane_b32 s11, v43, 27 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v43, 25 +; SI-NEXT: v_readlane_b32 s10, v43, 26 ; SI-NEXT: s_and_b32 s11, s11, 0xff 
; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 @@ -205692,23 +205699,22 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v43, 46 +; SI-NEXT: v_readlane_b32 s10, v43, 45 ; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_readlane_b32 s11, v43, 45 -; SI-NEXT: v_readlane_b32 s12, v43, 32 +; SI-NEXT: v_readlane_b32 s11, v43, 44 +; SI-NEXT: v_readlane_b32 s12, v43, 40 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s11, s70, 24 +; SI-NEXT: s_lshl_b32 s11, s69, 24 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readlane_b32 s11, v43, 31 -; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s11, s92, 3 ; SI-NEXT: v_readlane_b32 s13, v43, 30 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s96, 8 @@ -205721,27 +205727,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_add_i32 s12, s36, 3 +; SI-NEXT: v_readlane_b32 s12, v43, 37 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s13, s16, 8 -; SI-NEXT: s_add_i32 s14, s30, 3 +; SI-NEXT: s_lshl_b32 s13, s84, 8 +; SI-NEXT: s_add_i32 s14, s37, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s13, s69, 24 +; SI-NEXT: s_lshl_b32 s13, s68, 24 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_or_b32 s13, s13, s14 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s12, s13, 
s12 -; SI-NEXT: v_readlane_b32 s13, v43, 36 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s14, v43, 35 -; SI-NEXT: v_readlane_b32 s15, v43, 34 +; SI-NEXT: s_add_i32 s13, s91, 3 +; SI-NEXT: v_readlane_b32 s15, v43, 32 ; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_lshl_b32 s14, s90, 8 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v43, 33 +; SI-NEXT: v_readlane_b32 s14, v43, 31 ; SI-NEXT: s_and_b32 s15, s15, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_lshl_b32 s15, s15, 16 @@ -205749,29 +205754,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v42, 50 +; SI-NEXT: v_readlane_b32 s14, v42, 49 ; SI-NEXT: s_add_i32 s17, s14, 3 -; SI-NEXT: v_readlane_b32 s15, v42, 49 +; SI-NEXT: v_readlane_b32 s15, v42, 48 ; SI-NEXT: s_and_b32 s14, s17, 0xff ; SI-NEXT: s_lshl_b32 s15, s15, 8 -; SI-NEXT: s_add_i32 s16, s95, 3 +; SI-NEXT: s_add_i32 s16, s93, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s15, s93, 24 +; SI-NEXT: s_lshl_b32 s15, s95, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_addk_i32 s14, 0x300 ; SI-NEXT: s_or_b32 s15, s15, s16 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readlane_b32 s15, v43, 40 +; SI-NEXT: v_readlane_b32 s15, v43, 36 ; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: v_readlane_b32 s16, v43, 39 -; SI-NEXT: v_readlane_b32 s17, v43, 38 +; SI-NEXT: v_readlane_b32 s16, v43, 35 +; SI-NEXT: v_readlane_b32 s17, v43, 34 ; SI-NEXT: s_and_b32 s15, s15, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v43, 37 +; SI-NEXT: v_readlane_b32 s16, v43, 33 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: 
s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 @@ -205779,15 +205784,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s15, s15, 0xffff ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v42, 48 +; SI-NEXT: v_readlane_b32 s16, v42, 47 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 47 -; SI-NEXT: v_readlane_b32 s18, v42, 42 +; SI-NEXT: v_readlane_b32 s17, v42, 46 +; SI-NEXT: v_readlane_b32 s18, v42, 41 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s99, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 43 +; SI-NEXT: v_readlane_b32 s17, v42, 42 ; SI-NEXT: s_and_b32 s18, s99, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 @@ -205795,15 +205800,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 38 +; SI-NEXT: v_readlane_b32 s17, v42, 37 ; SI-NEXT: s_add_i32 s87, s17, 3 -; SI-NEXT: v_readlane_b32 s18, v42, 35 -; SI-NEXT: v_readlane_b32 s19, v42, 29 +; SI-NEXT: v_readlane_b32 s18, v42, 34 +; SI-NEXT: v_readlane_b32 s19, v42, 28 ; SI-NEXT: s_and_b32 s17, s87, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s23, s19, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v42, 30 +; SI-NEXT: v_readlane_b32 s18, v42, 29 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 @@ -205812,16 +205817,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_add_i32 s40, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 45 +; SI-NEXT: v_readlane_b32 s16, v42, 44 ; SI-NEXT: 
s_add_i32 s41, s17, 0x3000000 ; SI-NEXT: s_add_i32 s68, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 44 -; SI-NEXT: v_readlane_b32 s18, v42, 39 +; SI-NEXT: v_readlane_b32 s17, v42, 43 +; SI-NEXT: v_readlane_b32 s18, v42, 38 ; SI-NEXT: s_and_b32 s16, s68, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s96, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 40 +; SI-NEXT: v_readlane_b32 s17, v42, 39 ; SI-NEXT: s_and_b32 s18, s96, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 @@ -205829,33 +205834,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 33 +; SI-NEXT: v_readlane_b32 s17, v42, 32 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_readlane_b32 s18, v42, 31 +; SI-NEXT: v_readlane_b32 s18, v42, 30 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v42, 24 +; SI-NEXT: v_readlane_b32 s18, v42, 23 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s42, s16, 0x3000000 ; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: v_readlane_b32 s17, v42, 25 +; SI-NEXT: v_readlane_b32 s17, v42, 24 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s43, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 46 +; SI-NEXT: v_readlane_b32 s16, v42, 45 ; SI-NEXT: s_add_i32 s23, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 41 -; SI-NEXT: v_readlane_b32 s18, v42, 37 +; SI-NEXT: v_readlane_b32 s17, v42, 40 +; SI-NEXT: v_readlane_b32 s18, v42, 36 ; SI-NEXT: s_and_b32 s16, s23, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s86, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: 
v_readlane_b32 s17, v42, 36 +; SI-NEXT: v_readlane_b32 s17, v42, 35 ; SI-NEXT: s_and_b32 s18, s86, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205864,15 +205869,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s44, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 28 +; SI-NEXT: v_readlane_b32 s16, v42, 27 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 22 -; SI-NEXT: v_readlane_b32 s18, v42, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 21 +; SI-NEXT: v_readlane_b32 s18, v42, 15 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 15 +; SI-NEXT: v_readlane_b32 s17, v42, 14 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205881,15 +205886,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s45, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 34 +; SI-NEXT: v_readlane_b32 s16, v42, 33 ; SI-NEXT: s_add_i32 s83, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 32 -; SI-NEXT: v_readlane_b32 s18, v42, 26 +; SI-NEXT: v_readlane_b32 s17, v42, 31 +; SI-NEXT: v_readlane_b32 s18, v42, 25 ; SI-NEXT: s_and_b32 s16, s83, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 27 +; SI-NEXT: v_readlane_b32 s17, v42, 26 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205898,15 +205903,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s46, 
s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 23 +; SI-NEXT: v_readlane_b32 s16, v42, 22 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 21 -; SI-NEXT: v_readlane_b32 s18, v42, 17 +; SI-NEXT: v_readlane_b32 s17, v42, 20 +; SI-NEXT: v_readlane_b32 s18, v42, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: v_readlane_b32 s17, v42, 17 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205915,15 +205920,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s47, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 20 +; SI-NEXT: v_readlane_b32 s16, v42, 19 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 19 -; SI-NEXT: v_readlane_b32 s18, v42, 13 +; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: v_readlane_b32 s18, v42, 12 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 14 +; SI-NEXT: v_readlane_b32 s17, v42, 13 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205932,15 +205937,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s56, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 12 +; SI-NEXT: v_readlane_b32 s16, v42, 11 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 11 -; SI-NEXT: v_readlane_b32 s18, v42, 7 +; SI-NEXT: v_readlane_b32 s17, v42, 10 +; SI-NEXT: v_readlane_b32 s18, v42, 6 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: 
s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: v_readlane_b32 s17, v42, 7 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205949,15 +205954,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s57, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 10 +; SI-NEXT: v_readlane_b32 s16, v42, 9 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 9 -; SI-NEXT: v_readlane_b32 s18, v42, 5 +; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: v_readlane_b32 s18, v42, 4 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 6 +; SI-NEXT: v_readlane_b32 s17, v42, 5 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205966,15 +205971,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s58, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 4 +; SI-NEXT: v_readlane_b32 s16, v42, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s18, v43, 62 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s17, v43, 63 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205983,15 +205988,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; 
SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s59, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 2 +; SI-NEXT: v_readlane_b32 s16, v42, 1 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 1 -; SI-NEXT: v_readlane_b32 s18, v43, 61 +; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 60 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v43, 61 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206000,15 +206005,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s60, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 60 +; SI-NEXT: v_readlane_b32 s16, v43, 59 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 59 -; SI-NEXT: v_readlane_b32 s18, v43, 55 +; SI-NEXT: v_readlane_b32 s17, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v43, 54 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: v_readlane_b32 s17, v43, 55 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206017,15 +206022,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s61, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 58 +; SI-NEXT: v_readlane_b32 s16, v43, 57 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 57 -; SI-NEXT: v_readlane_b32 s18, v43, 53 +; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: v_readlane_b32 s18, v43, 52 ; SI-NEXT: s_and_b32 s16, s16, 
0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 54 +; SI-NEXT: v_readlane_b32 s17, v43, 53 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206034,15 +206039,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s62, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 52 +; SI-NEXT: v_readlane_b32 s16, v43, 51 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 51 -; SI-NEXT: v_readlane_b32 s18, v43, 49 +; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: v_readlane_b32 s18, v43, 48 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: v_readlane_b32 s17, v43, 49 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206071,12 +206076,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s16, v43, 1 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_readlane_b32 s17, v43, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 47 +; SI-NEXT: v_readlane_b32 s18, v43, 46 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 48 +; SI-NEXT: v_readlane_b32 s17, v43, 47 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206132,6 +206137,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s15, s15, 0x3000000 ; SI-NEXT: s_add_i32 s75, s16, 0x3000000 ; SI-NEXT: s_lshr_b64 s[76:77], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 
16 ; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 @@ -206141,12 +206147,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[72:73], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 -; SI-NEXT: s_lshr_b64 s[20:21], s[58:59], 16 -; SI-NEXT: s_lshr_b64 s[24:25], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[84:85], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 50 ; SI-NEXT: s_lshr_b32 s55, s75, 16 ; SI-NEXT: s_lshr_b32 s53, s73, 16 ; SI-NEXT: s_lshr_b32 s51, s63, 16 @@ -206157,12 +206163,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s22, s45, 16 ; SI-NEXT: s_lshr_b32 s97, s43, 16 ; SI-NEXT: s_lshr_b32 s65, s41, 16 -; SI-NEXT: s_lshr_b32 s19, s15, 16 -; SI-NEXT: s_lshr_b32 s71, s13, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 16 +; SI-NEXT: s_lshr_b32 s21, s13, 16 ; SI-NEXT: s_lshr_b32 s67, s11, 16 ; SI-NEXT: s_lshr_b32 s80, s9, 16 ; SI-NEXT: s_lshr_b32 s81, s7, 16 ; SI-NEXT: s_lshr_b32 s77, s5, 16 +; SI-NEXT: v_writelane_b32 v42, s17, 51 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: s_and_b32 s16, s74, 0xffff ; SI-NEXT: s_lshl_b32 s17, s76, 16 @@ -206203,7 +206210,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s60, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: s_lshl_b32 s17, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; 
SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206217,7 +206224,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s58, 0xffff -; SI-NEXT: s_lshl_b32 s17, s20, 16 +; SI-NEXT: s_lshl_b32 s17, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206231,7 +206238,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s56, 0xffff -; SI-NEXT: s_lshl_b32 s17, s24, 16 +; SI-NEXT: s_lshl_b32 s17, s94, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206245,7 +206252,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s46, 0xffff -; SI-NEXT: s_lshl_b32 s17, s34, 16 +; SI-NEXT: s_lshl_b32 s17, s30, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206259,7 +206266,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s44, 0xffff -; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: s_lshl_b32 s17, s34, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206269,11 +206276,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s17, s22, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s18, v42, 50 ; SI-NEXT: 
buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s84, 16 +; SI-NEXT: s_lshl_b32 s17, s18, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206308,7 +206316,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_lshl_b32 s15, s71, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206322,7 +206330,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s71, 16 +; SI-NEXT: s_lshl_b32 s13, s21, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206386,6 +206394,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s19, v42, 51 ; SI-NEXT: v_readlane_b32 s99, v41, 35 ; SI-NEXT: v_readlane_b32 s98, v41, 34 ; SI-NEXT: v_readlane_b32 s97, v41, 33 @@ -206430,6 +206439,8 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 50 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr55 @@ -206440,32 
+206451,32 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: v_writelane_b32 v42, s5, 51 ; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr67 @@ -213474,24 +213485,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; 
implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -213512,10 +213505,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr63 @@ -213523,7 +213515,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -213532,13 +213523,33 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; 
GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -213546,9 +213557,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: 
$vgpr43 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -213572,7 +213586,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(43) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -213631,180 +213645,195 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[7:8] +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: 
buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: 
v_lshrrev_b64 v[53:54], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 
v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -213814,9 +213843,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29 ; GFX9-NEXT: .LBB98_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v43, v50 +; GFX9-NEXT: v_mov_b32_e32 v50, v40 +; GFX9-NEXT: v_mov_b32_e32 v40, v55 +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] @@ -213828,12 +213861,36 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] @@ -213865,164 +213922,149 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill 
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 
v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 
; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -214039,41 +214081,50 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214082,84 +214133,103 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; 
GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -214169,16 +214239,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: 
v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -214188,23 +214258,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 @@ -214212,14 +214275,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -214228,11 +214295,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -214241,10 +214308,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214254,11 +214321,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -214267,10 +214334,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214280,11 +214347,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -214293,10 +214360,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, 
v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214306,11 +214373,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ 
-214319,10 +214386,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214332,53 +214399,34 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 ; 
GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -241104,75 +241152,76 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:88 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: 
v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v49 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 ; 
SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v59 ; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v61 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v62 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 @@ -241181,18 +241230,17 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v6 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 ; SI-NEXT: 
s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 @@ -241213,23 +241261,33 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v63, v49 +; SI-NEXT: v_mov_b32_e32 v49, v54 +; SI-NEXT: v_mov_b32_e32 v54, v41 +; SI-NEXT: v_mov_b32_e32 v41, v9 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v5, v34 +; SI-NEXT: v_mov_b32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v59, v29 -; SI-NEXT: v_mov_b32_e32 v29, v27 -; SI-NEXT: v_mov_b32_e32 v57, v23 +; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: v_mov_b32_e32 v57, v15 ; SI-NEXT: v_mov_b32_e32 v60, v3 ; SI-NEXT: v_mov_b32_e32 v62, v4 -; SI-NEXT: v_mov_b32_e32 v63, v49 -; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v26, v14 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 @@ -241241,7 +241299,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -241252,21 +241310,18 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v6, v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_or_b32_e32 v61, v14, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v36 ; SI-NEXT: v_or_b32_e32 v33, v33, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -241299,240 +241354,236 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_or_b32_e32 v61, v3, v37 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_or_b32_e32 v11, v11, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_or_b32_e32 v24, v24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 
0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v52, v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v55, v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v43, v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: 
v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: 
v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v29, v29, v23 ; SI-NEXT: v_or_b32_e32 v38, v38, v47 ; SI-NEXT: v_or_b32_e32 v54, v54, v42 +; SI-NEXT: v_or_b32_e32 v49, v49, v51 ; SI-NEXT: v_or_b32_e32 v45, v45, v50 -; SI-NEXT: v_or_b32_e32 v41, v41, v30 ; SI-NEXT: v_or_b32_e32 v46, v46, v32 +; SI-NEXT: v_alignbit_b32 v50, v61, v50, 16 +; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v11, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_or_b32_e32 v18, v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 
16 ; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16 ; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 -; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16 -; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 -; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v34 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; SI-NEXT: v_or_b32_e32 v52, v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v55 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40 +; SI-NEXT: v_or_b32_e32 v55, v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_or_b32_e32 v43, v37, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: 
v_or_b32_e32 v4, v37, v34 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v39, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v4, v39, v1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_or_b32_e32 v4, v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v39, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v39, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; SI-NEXT: v_or_b32_e32 v62, v56, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v60 -; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_or_b32_e32 v60, v56, v39 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v3, v41, v30 +; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16 ; SI-NEXT: v_or_b32_e32 v57, v56, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v59 +; SI-NEXT: v_alignbit_b32 v41, v19, v39, 16 +; SI-NEXT: v_mov_b32_e32 v39, v3 ; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v29, v29, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 
v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: v_or_b32_e32 v59, v56, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 +; SI-NEXT: v_alignbit_b32 v27, v21, v27, 16 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: v_or_b32_e32 v63, v56, v35 +; SI-NEXT: v_alignbit_b32 v56, v18, v37, 16 ; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v3 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v3, v49, v51 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v19, v39, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v49, v18, v37, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v21, v27, 16 -; SI-NEXT: v_alignbit_b32 v51, v61, v51, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: .LBB108_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -241546,7 +241597,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v34, v34, v37 ; SI-NEXT: v_add_i32_e32 
v37, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_and_b32_e32 v34, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v34, v1 @@ -241559,7 +241610,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 @@ -241572,7 +241623,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -241587,7 +241638,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -241597,11 +241648,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -241623,11 +241672,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -241637,11 +241684,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -241669,23 +241714,23 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -241710,13 +241755,13 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 
77df03dcdcd9f..09cc571f9de32 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -48006,33 +48006,32 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 +; 
SI-NEXT: v_cvt_f16_f32_e32 v14, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 @@ -48041,44 +48040,44 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v37 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v39 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v51 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: 
v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -48088,47 +48087,34 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v54, v9 ; SI-NEXT: v_mov_b32_e32 v55, v11 ; SI-NEXT: v_mov_b32_e32 v41, v13 -; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v3, v43 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 -; 
SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 @@ -48147,6 +48133,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 @@ -48165,41 +48152,50 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 +; SI-NEXT: 
v_lshlrev_b32_e32 v47, 16, v37 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v5, v38, v47 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 +; SI-NEXT: v_or_b32_e32 v9, v38, v47 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_or_b32_e32 v48, v39, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 ; SI-NEXT: v_or_b32_e32 v9, v37, v45 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill @@ 
-48210,35 +48206,37 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v38, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v41, v39, v43 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v58 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_or_b32_e32 v55, v37, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v55, v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 ; SI-NEXT: v_or_b32_e32 v54, v25, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 -; SI-NEXT: v_or_b32_e32 v52, v37, v40 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v52, v37, v40 ; SI-NEXT: 
v_cvt_f16_f32_e32 v37, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v62 @@ -48249,22 +48247,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v49 ; SI-NEXT: v_or_b32_e32 v62, v25, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v61, v29, v28 +; SI-NEXT: v_or_b32_e32 v49, v29, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v60 -; SI-NEXT: v_or_b32_e32 v49, v21, v27 +; SI-NEXT: v_or_b32_e32 v38, v21, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 @@ -48278,6 +48276,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_or_b32_e32 v41, v39, v43 ; SI-NEXT: v_or_b32_e32 v39, v29, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 @@ -48291,14 +48290,19 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v34, v21, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: 
v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v14 ; SI-NEXT: v_or_b32_e32 v53, v7, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 ; SI-NEXT: v_or_b32_e32 v50, v25, v21 @@ -48306,18 +48310,17 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v35, v13, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 ; SI-NEXT: v_or_b32_e32 v16, v16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 ; SI-NEXT: v_or_b32_e32 v15, v15, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 ; SI-NEXT: v_alignbit_b32 v29, v35, v28, 16 +; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16 +; SI-NEXT: v_mov_b32_e32 v60, v37 +; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16 ; SI-NEXT: v_or_b32_e32 v22, v21, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 ; SI-NEXT: v_or_b32_e32 v24, v24, v21 @@ -48338,19 +48341,15 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 ; SI-NEXT: v_alignbit_b32 v45, v1, v57, 16 ; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 +; SI-NEXT: v_alignbit_b32 v14, v19, v42, 16 ; SI-NEXT: v_alignbit_b32 v21, v24, v58, 16 +; SI-NEXT: v_mov_b32_e32 v58, v38 ; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 ; SI-NEXT: 
v_alignbit_b32 v40, v15, v30, 16 ; SI-NEXT: v_alignbit_b32 v30, v16, v59, 16 -; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16 -; SI-NEXT: v_mov_b32_e32 v60, v37 -; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v56 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -48359,7 +48358,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v37, v37, v38 ; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen @@ -48383,10 +48382,8 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -48417,7 +48414,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 
16, v14 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -48459,7 +48456,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -48476,7 +48473,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 @@ -48488,7 +48485,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index c9e5771240078..0d85c2076c0ed 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -51847,79 +51847,79 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 -; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill 
+; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v50, v47 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v58 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: 
v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v59 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v35 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 @@ -51944,11 +51944,21 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_mov_b32_e32 v8, v60 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v47, v21 ; SI-NEXT: v_mov_b32_e32 v56, v17 ; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v58, v7 -; SI-NEXT: v_mov_b32_e32 v59, v33 +; SI-NEXT: v_mov_b32_e32 v59, v61 +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: v_mov_b32_e32 v29, v25 +; SI-NEXT: v_mov_b32_e32 v25, v18 +; SI-NEXT: v_mov_b32_e32 v21, v16 +; SI-NEXT: v_mov_b32_e32 v17, v1 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true @@ -51958,12 +51968,14 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 
0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -51973,35 +51985,36 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_or_b32_e32 v3, v3, v34 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_or_b32_e32 v63, v6, v34 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v1, v31, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_or_b32_e32 v31, v12, v34 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v62 ; 
SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 @@ -52015,13 +52028,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v12, v12, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -52031,7 +52042,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_or_b32_e32 v62, v18, v34 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v22, v22, v34 @@ -52043,8 +52054,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v2, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 @@ -52056,79 +52068,89 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_or_b32_e32 v48, v34, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_or_b32_e32 v52, v34, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v34 ; SI-NEXT: 
v_add_f32_e32 v34, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: v_or_b32_e32 v55, v34, v35 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_or_b32_e32 v6, v35, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: 
v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -52136,102 +52158,91 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 ; SI-NEXT: v_or_b32_e32 v25, v25, v24 ; SI-NEXT: v_or_b32_e32 v29, v29, v28 ; SI-NEXT: v_or_b32_e32 v54, v54, v51 ; SI-NEXT: v_or_b32_e32 v50, v50, v30 +; SI-NEXT: v_or_b32_e32 v33, v33, v42 ; SI-NEXT: v_or_b32_e32 v39, v39, v41 ; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16 ; SI-NEXT: v_alignbit_b32 v24, 
v26, v24, 16 ; SI-NEXT: v_alignbit_b32 v28, v22, v28, 16 -; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16 -; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v30, 16 +; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16 ; SI-NEXT: v_alignbit_b32 v41, v3, v41, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_or_b32_e32 v6, v35, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_or_b32_e32 v58, v35, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: v_alignbit_b32 v8, v48, v8, 16 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_or_b32_e32 v57, v46, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v47 -; SI-NEXT: v_alignbit_b32 v14, v37, v14, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v56, v35, v17 +; SI-NEXT: v_alignbit_b32 v17, v2, v17, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 
v57, v46, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v47 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_or_b32_e32 v59, v46, v43 -; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v47, v35, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v61 +; SI-NEXT: v_or_b32_e32 v59, v46, v43 +; SI-NEXT: v_alignbit_b32 v46, v52, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, v37, v14, 16 +; SI-NEXT: v_mov_b32_e32 v14, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_alignbit_b32 v21, v11, v21, 16 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 ; SI-NEXT: v_or_b32_e32 v61, v44, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v45 +; SI-NEXT: v_alignbit_b32 v51, v14, v51, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44 ; SI-NEXT: v_or_b32_e32 v36, v36, v45 -; SI-NEXT: v_alignbit_b32 v44, v18, v35, 16 +; SI-NEXT: v_alignbit_b32 v44, v62, v35, 16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v45, v31, v45, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: 
v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v6, v33, v42 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v2, v17, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v11, v21, 16 -; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 @@ -52243,7 +52254,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v34, v34, v35 ; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v1, v34, v1 @@ -52266,9 +52277,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen @@ -52278,11 +52291,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen @@ -52292,11 +52303,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52337,7 +52346,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 @@ -52350,7 +52359,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) 
{ ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52361,7 +52370,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 @@ -52373,8 +52382,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52391,10 +52400,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir 
b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 4a0bb6ceccd3f..1b5d4a9c1b929 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -42,14 +42,11 @@ body: | ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr20_sgpr21 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr22_sgpr23 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr24_sgpr25 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable 
$sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 @@ -58,7 +55,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]] @@ -67,7 +64,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.5(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from 
%stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr72 @@ -95,12 +92,12 @@ body: | ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57 ; CHECK-NEXT: renamable $sgpr54 = COPY killed renamable $sgpr76 - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 - ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54 - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50 - ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 - ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58 = COPY renamable $sgpr52_sgpr53_sgpr54 + ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK-NEXT: renamable 
$sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 + ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr56_sgpr57_sgpr58 + ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr76 ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) @@ -165,23 +162,22 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.12(0x40000000), %bb.6(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, 
$sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 renamable $sgpr22_sgpr23, undef renamable $sgpr54_sgpr55, implicit-def dead $scc ; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.7(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.8(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept 
V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec @@ -189,14 +185,14 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.10, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.9: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 
:: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec @@ -218,11 +214,17 @@ body: | ; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr15 ; CHECK-NEXT: renamable $sgpr85 = COPY killed renamable $sgpr14 ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19 + ; CHECK-NEXT: renamable $sgpr50_sgpr51 = COPY killed renamable $sgpr20_sgpr21 + ; CHECK-NEXT: renamable $sgpr36_sgpr37 = COPY killed renamable $sgpr22_sgpr23 + ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY killed renamable $sgpr24_sgpr25 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 + ; CHECK-NEXT: renamable $sgpr24_sgpr25 = COPY killed renamable $sgpr38_sgpr39 + ; CHECK-NEXT: renamable $sgpr22_sgpr23 = COPY killed renamable $sgpr36_sgpr37 + ; CHECK-NEXT: renamable $sgpr20_sgpr21 = COPY killed renamable $sgpr50_sgpr51 ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr83 @@ -238,44 +240,42 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.10: ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.12(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, 
$sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.11: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.12: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.13(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.13: ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; 
CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.14 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.14: ; CHECK-NEXT: successors: %bb.15(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.15: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable 
$sgpr20_sgpr21, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.16: diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir index 81f72b70d1ecb..f63be0c5e8bc0 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir @@ -4,7 +4,7 @@ --- # Check that physreg candidate is not used since cannot be spilled in a block, # e.g. before exec mask preamble -# CHECK: , cannot spill all interferences. +# CHECK-NOT: , cannot spill all interferences. name: foo tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index b5474b8974b29..1c5f221dd679b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -9742,170 +9742,122 @@ entry: define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: test_limited_sgpr: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 +; GFX6-NEXT: s_mov_b32 s18, 0 +; GFX6-NEXT: v_mov_b32_e32 v6, 0 +; GFX6-NEXT: s_mov_b32 s19, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[16:17], s[14:15] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:240 ; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 ; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s42, -1 ; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 ; GFX6-NEXT: s_add_u32 s40, s40, s11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: s_mov_b64 s[4:5], 
exec -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_writelane_b32 v1, s0, 0 -; GFX6-NEXT: v_writelane_b32 v1, s1, 1 -; GFX6-NEXT: v_writelane_b32 v1, s2, 2 -; GFX6-NEXT: v_writelane_b32 v1, s3, 3 -; GFX6-NEXT: s_mov_b32 s8, 0x80400 -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s8 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240 -; GFX6-NEXT: s_mov_b32 s2, 0x86a00 -; GFX6-NEXT: s_mov_b64 s[8:9], exec -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224 -; GFX6-NEXT: s_mov_b32 s2, 0x86600 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208 -; GFX6-NEXT: s_mov_b32 s2, 0x86200 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte 
Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192 -; GFX6-NEXT: s_mov_b32 s2, 0x85e00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176 -; GFX6-NEXT: s_mov_b32 s2, 0x85a00 +; GFX6-NEXT: s_mov_b32 s0, 0x85e00 +; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[16:19], 0 addr64 offset:32 +; GFX6-NEXT: buffer_load_dwordx4 v[16:19], v[5:6], s[16:19], 0 addr64 offset:48 +; GFX6-NEXT: s_waitcnt vmcnt(2) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:224 +; GFX6-NEXT: s_mov_b32 s0, 0x85a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 
; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160 -; GFX6-NEXT: s_mov_b32 s2, 0x85600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:208 +; GFX6-NEXT: s_mov_b32 s0, 0x85600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144 -; GFX6-NEXT: s_mov_b32 s2, 0x85200 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:192 +; GFX6-NEXT: s_mov_b32 s0, 0x85200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, 
s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128 -; GFX6-NEXT: s_mov_b32 s2, 0x84e00 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:176 +; GFX6-NEXT: s_mov_b32 s0, 0x84e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112 -; GFX6-NEXT: s_mov_b32 s2, 0x84a00 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:160 +; GFX6-NEXT: s_mov_b32 s0, 0x84a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96 -; GFX6-NEXT: s_mov_b32 s2, 0x84600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:144 +; GFX6-NEXT: s_mov_b32 s0, 0x84600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80 -; GFX6-NEXT: s_mov_b32 s2, 0x84200 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:128 +; GFX6-NEXT: s_mov_b32 s0, 0x84200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded 
Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x83a00 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:112 +; GFX6-NEXT: s_mov_b32 s0, 0x83e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x83200 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:96 +; GFX6-NEXT: s_mov_b32 s0, 0x83a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded 
Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32 -; GFX6-NEXT: s_mov_b32 s2, 0x83600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:80 +; GFX6-NEXT: s_mov_b32 s0, 0x83600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[20:23], v[5:6], s[16:19], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s0, 0 -; GFX6-NEXT: v_writelane_b32 v4, s1, 1 -; GFX6-NEXT: v_writelane_b32 v4, s2, 2 -; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s10, 0x80800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48 -; GFX6-NEXT: s_mov_b32 s0, 0x83e00 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 -; GFX6-NEXT: 
v_add_i32_e32 v4, vcc, 16, v4 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[16:19], 0 addr64 offset:16 +; GFX6-NEXT: s_mov_b32 s0, 0x83200 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: v_mov_b32_e32 v7, 1 ; GFX6-NEXT: s_mov_b64 s[0:1], exec @@ -9924,22 +9876,76 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 ; GFX6-NEXT: v_writelane_b32 v4, s10, 6 ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: s_mov_b32 s2, 0x80400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[8:15] +; GFX6-NEXT: ; def s[4:11] ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v4, s4, 0 +; GFX6-NEXT: v_writelane_b32 v4, s5, 1 +; GFX6-NEXT: v_writelane_b32 v4, s6, 2 +; GFX6-NEXT: v_writelane_b32 v4, s7, 3 +; GFX6-NEXT: v_writelane_b32 v4, s8, 4 +; GFX6-NEXT: v_writelane_b32 v4, s9, 5 +; GFX6-NEXT: v_writelane_b32 v4, s10, 6 +; GFX6-NEXT: v_writelane_b32 v4, s11, 7 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: buffer_store_dword v4, off, 
s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[16:23] +; GFX6-NEXT: ; def s[4:11] ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v4, s4, 0 +; GFX6-NEXT: v_writelane_b32 v4, s5, 1 +; GFX6-NEXT: v_writelane_b32 v4, s6, 2 +; GFX6-NEXT: v_writelane_b32 v4, s7, 3 +; GFX6-NEXT: v_writelane_b32 v4, s8, 4 +; GFX6-NEXT: v_writelane_b32 v4, s9, 5 +; GFX6-NEXT: v_writelane_b32 v4, s10, 6 +; GFX6-NEXT: v_writelane_b32 v4, s11, 7 +; GFX6-NEXT: s_mov_b32 s2, 0x81400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[24:31] +; GFX6-NEXT: ; def s[0:7] ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v4, s0, 0 +; GFX6-NEXT: v_writelane_b32 v4, s1, 1 +; GFX6-NEXT: v_writelane_b32 v4, s2, 2 +; GFX6-NEXT: v_writelane_b32 v4, s3, 3 +; GFX6-NEXT: v_writelane_b32 v4, s4, 4 +; GFX6-NEXT: v_writelane_b32 v4, s5, 5 +; GFX6-NEXT: v_writelane_b32 v4, s6, 6 +; GFX6-NEXT: v_writelane_b32 v4, s7, 7 +; GFX6-NEXT: s_mov_b32 s10, 0x81c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[0:3] ; 
GFX6-NEXT: ;;#ASMEND @@ -9950,33 +9956,28 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX6-NEXT: s_mov_b64 vcc, s[6:7] ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s8, 0 -; GFX6-NEXT: v_writelane_b32 v4, s9, 1 -; GFX6-NEXT: v_writelane_b32 v4, s10, 2 -; GFX6-NEXT: v_writelane_b32 v4, s11, 3 -; GFX6-NEXT: v_writelane_b32 v4, s12, 4 -; GFX6-NEXT: v_writelane_b32 v4, s13, 5 -; GFX6-NEXT: v_writelane_b32 v4, s14, 6 -; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x81400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v4, s12, 0 +; GFX6-NEXT: v_writelane_b32 v4, s13, 1 +; GFX6-NEXT: v_writelane_b32 v4, s14, 2 +; GFX6-NEXT: v_writelane_b32 v4, s15, 3 +; GFX6-NEXT: s_mov_b32 s10, 0x82400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[8:9] +; GFX6-NEXT: s_mov_b64 s[20:21], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x80c00 +; GFX6-NEXT: s_mov_b32 s22, 0x80400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s22 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s8, v4, 0 ; GFX6-NEXT: v_readlane_b32 s9, v4, 1 @@ 
-9988,31 +9989,27 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s15, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: s_mov_b64 exec, s[20:21] +; GFX6-NEXT: s_mov_b64 s[20:21], exec +; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_writelane_b32 v4, s16, 0 ; GFX6-NEXT: v_writelane_b32 v4, s17, 1 ; GFX6-NEXT: v_writelane_b32 v4, s18, 2 ; GFX6-NEXT: v_writelane_b32 v4, s19, 3 -; GFX6-NEXT: v_writelane_b32 v4, s20, 4 -; GFX6-NEXT: v_writelane_b32 v4, s21, 5 -; GFX6-NEXT: v_writelane_b32 v4, s22, 6 -; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x81c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s22, 0x82c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s22 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[20:21] +; GFX6-NEXT: s_mov_b64 s[24:25], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x81400 +; GFX6-NEXT: s_mov_b32 s26, 0x80c00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s26 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s16, v4, 0 ; GFX6-NEXT: v_readlane_b32 s17, v4, 1 @@ -10024,31 +10021,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s23, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, 
s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s24, 0 -; GFX6-NEXT: v_writelane_b32 v4, s25, 1 -; GFX6-NEXT: v_writelane_b32 v4, s26, 2 -; GFX6-NEXT: v_writelane_b32 v4, s27, 3 -; GFX6-NEXT: v_writelane_b32 v4, s28, 4 -; GFX6-NEXT: v_writelane_b32 v4, s29, 5 -; GFX6-NEXT: v_writelane_b32 v4, s30, 6 -; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x82400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[24:25] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x81c00 +; GFX6-NEXT: s_mov_b32 s36, 0x81400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s24, v4, 0 ; GFX6-NEXT: v_readlane_b32 s25, v4, 1 @@ -10060,8 +10039,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s31, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10069,12 +10048,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr 
addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s1, 1 ; GFX6-NEXT: v_writelane_b32 v4, s2, 2 ; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s34, 0x82c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x82800 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_mov_b64 exec, s[34:35] ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 3 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -10087,10 +10066,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 vcc, s[6:7] ; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s36, 0x82400 +; GFX6-NEXT: s_mov_b32 s36, 0x81c00 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10108,7 +10088,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s44, 0x82c00 +; GFX6-NEXT: s_mov_b32 s44, 0x82800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10134,84 +10114,102 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; 
GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: .LBB1_2: ; %ret -; GFX6-NEXT: s_or_b64 exec, exec, vcc -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[6:7], vcc +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s6, 0x80400 +; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 +; GFX6-NEXT: v_readlane_b32 s16, v4, 0 +; GFX6-NEXT: v_readlane_b32 s17, v4, 1 +; GFX6-NEXT: v_readlane_b32 s18, v4, 2 +; GFX6-NEXT: v_readlane_b32 s19, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1] -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s6, 0x80800 +; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 +; 
GFX6-NEXT: v_readlane_b32 s12, v4, 0 +; GFX6-NEXT: v_readlane_b32 s13, v4, 1 +; GFX6-NEXT: v_readlane_b32 s14, v4, 2 +; GFX6-NEXT: v_readlane_b32 s15, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s0, 0x86a00 -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b32 s0, 0x86200 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_mov_b32 s0, 0x86600 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240 +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s0, 0x86a00 +; GFX6-NEXT: s_waitcnt expcnt(4) +; GFX6-NEXT: v_mov_b32_e32 v0, v20 +; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v19, off, 
s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: v_mov_b32_e32 v1, v21 +; GFX6-NEXT: v_mov_b32_e32 v2, v22 +; GFX6-NEXT: v_mov_b32_e32 v3, v23 +; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt expcnt(2) +; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt expcnt(1) +; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s0, 0x86600 +; GFX6-NEXT: v_mov_b32_e32 v23, v3 +; GFX6-NEXT: buffer_load_dword v12, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x86200 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: v_mov_b32_e32 v22, v2 +; GFX6-NEXT: 
v_mov_b32_e32 v21, v1 +; GFX6-NEXT: v_mov_b32_e32 v20, v0 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: .LBB1_2: ; %ret +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_mov_b32 s0, 0x85e00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b64 s[14:15], s[18:19] ; GFX6-NEXT: s_mov_b32 s0, 0x85a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10219,7 +10217,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85600 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10227,7 +10225,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10235,7 +10233,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10243,7 +10241,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], 
v[5:6], s[12:15], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10251,7 +10249,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10259,23 +10257,23 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83a00 +; GFX6-NEXT: s_mov_b32 s0, 0x83e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 
4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83e00 +; GFX6-NEXT: s_mov_b32 s0, 0x83a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10283,7 +10281,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10291,15 +10289,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[20:23], v[5:6], s[12:15], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], v[5:6], s[12:15], 0 addr64 offset:48 +; GFX6-NEXT: 
buffer_store_dwordx4 v[12:15], v[5:6], s[12:15], 0 addr64 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[12:15], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll index 3913e93b83a66..1dddc29deae25 100644 --- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll +++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll @@ -31,22 +31,23 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: .cfi_offset %edi, -16 ; CHECK-NEXT: .cfi_offset %ebx, -12 ; CHECK-NEXT: .cfi_offset %ebp, -8 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: testb $1, %bl -; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: je LBB0_7 ; CHECK-NEXT: ## %bb.1: ## %bb116.i -; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: je LBB0_7 ; CHECK-NEXT: ## %bb.2: ## %bb52.i.i -; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: je LBB0_7 ; CHECK-NEXT: ## %bb.3: ## %bb142.i -; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: je LBB0_7 ; CHECK-NEXT: ## %bb.4: +; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movl L_.str89$non_lazy_ptr, %edi ; CHECK-NEXT: movb $1, %bh ; CHECK-NEXT: movl L_.str$non_lazy_ptr, %ebp ; CHECK-NEXT: jmp LBB0_5 -; CHECK-NEXT: 
LBB0_21: ## %bb7806 +; CHECK-NEXT: LBB0_23: ## %bb7806 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp16: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -57,50 +58,50 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: LBB0_5: ## %bb3261 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: cmpl $37, 0 -; CHECK-NEXT: jne LBB0_25 -; CHECK-NEXT: ## %bb.6: ## %bb3306 +; CHECK-NEXT: jne LBB0_6 +; CHECK-NEXT: ## %bb.8: ## %bb3306 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp0: ## EH_LABEL ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: calll __ZN12wxStringBaseaSEPKw ; CHECK-NEXT: Ltmp1: ## EH_LABEL -; CHECK-NEXT: ## %bb.7: ## %bb3314 +; CHECK-NEXT: ## %bb.9: ## %bb3314 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: movl 0, %eax ; CHECK-NEXT: cmpl $121, %eax -; CHECK-NEXT: ja LBB0_25 -; CHECK-NEXT: ## %bb.8: ## %bb3314 +; CHECK-NEXT: ja LBB0_6 +; CHECK-NEXT: ## %bb.10: ## %bb3314 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: jmpl *LJTI0_0(,%eax,4) -; CHECK-NEXT: LBB0_10: ## %bb5809 +; CHECK-NEXT: LBB0_12: ## %bb5809 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne LBB0_25 -; CHECK-NEXT: ## %bb.11: ## %bb5809 +; CHECK-NEXT: jne LBB0_6 +; CHECK-NEXT: ## %bb.13: ## %bb5809 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb %bh, %bh -; CHECK-NEXT: je LBB0_25 -; CHECK-NEXT: ## %bb.12: ## %bb91.i8504 +; CHECK-NEXT: je LBB0_6 +; CHECK-NEXT: ## %bb.14: ## %bb91.i8504 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb $1, %bl -; CHECK-NEXT: je LBB0_14 -; CHECK-NEXT: ## %bb.13: ## %bb155.i8541 +; CHECK-NEXT: je LBB0_16 +; CHECK-NEXT: ## %bb.15: ## %bb155.i8541 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp4: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, (%esp) 
; CHECK-NEXT: calll _gmtime_r ; CHECK-NEXT: Ltmp5: ## EH_LABEL -; CHECK-NEXT: LBB0_14: ## %bb182.i8560 +; CHECK-NEXT: LBB0_16: ## %bb182.i8560 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb $1, %bl -; CHECK-NEXT: je LBB0_15 -; CHECK-NEXT: ## %bb.16: ## %bb278.i8617 +; CHECK-NEXT: je LBB0_17 +; CHECK-NEXT: ## %bb.18: ## %bb278.i8617 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: je LBB0_18 -; CHECK-NEXT: ## %bb.17: ## %bb440.i8663 +; CHECK-NEXT: je LBB0_20 +; CHECK-NEXT: ## %bb.19: ## %bb440.i8663 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp6: ## EH_LABEL ; CHECK-NEXT: movl L_.str4$non_lazy_ptr, %eax @@ -113,11 +114,11 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: movl $1717, {{[0-9]+}}(%esp) ## imm = 0x6B5 ; CHECK-NEXT: calll __Z10wxOnAssertPKwiPKcS0_S0_ ; CHECK-NEXT: Ltmp7: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_18 -; CHECK-NEXT: LBB0_15: ## %bb187.i8591 +; CHECK-NEXT: jmp LBB0_20 +; CHECK-NEXT: LBB0_17: ## %bb187.i8591 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: jne LBB0_25 -; CHECK-NEXT: LBB0_18: ## %invcont5814 +; CHECK-NEXT: jne LBB0_6 +; CHECK-NEXT: LBB0_20: ## %invcont5814 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp8: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -126,7 +127,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: Ltmp9: ## EH_LABEL -; CHECK-NEXT: ## %bb.19: ## %invcont5831 +; CHECK-NEXT: ## %bb.21: ## %invcont5831 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp10: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -136,7 +137,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: calll __ZN12wxStringBase10ConcatSelfEmPKwm ; CHECK-NEXT: Ltmp11: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_5 -; CHECK-NEXT: LBB0_9: ## 
%bb5657 +; CHECK-NEXT: LBB0_11: ## %bb5657 ; CHECK-NEXT: Ltmp13: ## EH_LABEL ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -144,8 +145,8 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: movl %eax, (%esp) ; CHECK-NEXT: calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE ; CHECK-NEXT: Ltmp14: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_25 -; CHECK-NEXT: LBB0_20: ## %bb5968 +; CHECK-NEXT: jmp LBB0_6 +; CHECK-NEXT: LBB0_22: ## %bb5968 ; CHECK-NEXT: Ltmp2: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -153,23 +154,24 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: Ltmp3: ## EH_LABEL -; CHECK-NEXT: LBB0_25: ## %bb115.critedge.i +; CHECK-NEXT: LBB0_6: ## %bb3267 ; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: LBB0_7: ## %bb115.critedge.i ; CHECK-NEXT: addl $28, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: popl %ebx ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl $4 -; CHECK-NEXT: LBB0_23: ## %lpad.loopexit.split-lp +; CHECK-NEXT: LBB0_25: ## %lpad.loopexit.split-lp ; CHECK-NEXT: Ltmp15: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_25 -; CHECK-NEXT: LBB0_24: ## %lpad8185 +; CHECK-NEXT: jmp LBB0_6 +; CHECK-NEXT: LBB0_26: ## %lpad8185 ; CHECK-NEXT: Ltmp12: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_25 -; CHECK-NEXT: LBB0_22: ## %lpad.loopexit +; CHECK-NEXT: jmp LBB0_6 +; CHECK-NEXT: LBB0_24: ## %lpad.loopexit ; CHECK-NEXT: Ltmp18: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_25 +; CHECK-NEXT: jmp LBB0_6 ; CHECK-NEXT: Lfunc_end0: entry: br i1 %foo, label %bb116.i, label %bb115.critedge.i From a3a0066226ee9aa0670c7d1dbb8131d7cd633de2 Mon Sep 17 00:00:00 2001 From: Yuanke Luo Date: Wed, 19 Nov 2025 08:36:02 +0800 Subject: [PATCH 2/4] Revise the spill save/restor test case --- .../test/CodeGen/AMDGPU/spill-before-exec.mir | 91 
++++++++++++++++++- 1 file changed, 86 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir index f63be0c5e8bc0..a44f5b477f052 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir @@ -1,10 +1,8 @@ -# REQUIRES: asserts -# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -debug-only=regalloc -run-pass=greedy -filetype=null %s 2>&1 | FileCheck %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -run-pass=greedy -o - %s | FileCheck %s --- -# Check that physreg candidate is not used since cannot be spilled in a block, -# e.g. before exec mask preamble -# CHECK-NOT: , cannot spill all interferences. +# Check that spill save/restore should be inserted after $exec mask is defined. name: foo tracksRegLiveness: true @@ -12,6 +10,88 @@ machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr102_sgpr103 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.3, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.3, align 4, 
addrspace 5) + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: 
%bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: liveins: $sgpr98_sgpr99, $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: liveins: $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: $exec = IMPLICIT_DEF + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY1]].sub0_sub1, [[SI_SPILL_S128_RESTORE]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE1:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE2:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE1]].sub0_sub1, [[SI_SPILL_S128_RESTORE2]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE3:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE4:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.4, implicit 
$exec, implicit $sgpr32 :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE3]].sub0_sub1, [[SI_SPILL_S128_RESTORE4]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY2]].sub0_sub1, [[COPY3]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY4]].sub0_sub1, [[COPY5]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY6]].sub0_sub1, [[COPY7]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY8]].sub0_sub1, [[COPY9]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY10]].sub0_sub1, [[COPY11]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY12]].sub0_sub1, [[COPY13]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY14]].sub0_sub1, [[COPY15]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY16]].sub0_sub1, [[COPY17]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY18]].sub0_sub1, [[COPY19]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY20]].sub0_sub1, [[COPY21]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit [[S_OR_SAVEEXEC_B64_1]], implicit $vgpr0 bb.0: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103 @@ -68,6 +148,7 @@ body: | S_BRANCH %bb.4 bb.4: + $exec = IMPLICIT_DEF S_CMP_EQ_U64 %1.sub0_sub1, %2.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %3.sub0_sub1, %4.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %5.sub0_sub1, %6.sub2_sub3, implicit-def $scc From ea0c4465fa5d2493afc1336f865f2031ff7843b3 Mon Sep 17 00:00:00 2001 From: Yuanke Luo Date: Wed, 19 Nov 2025 09:19:01 +0800 Subject: [PATCH 3/4] Add comments derived from https://reviews.llvm.org/D52052 --- llvm/lib/CodeGen/SplitKit.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 
f87c5f43ccaaa..f27ff674dcf8c 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -163,6 +163,26 @@ bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI, if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual()) continue; + // For the AMDGPU target, if an MBB contains an exec mask restore + // preamble, SplitEditor may get into a state where it cannot insert a + // spill instruction at the beginning of the MBB. + // E.g. for a MIR + // bb.100: + // %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, + // implicit $exec + // ... + // use %1 + // If the regalloc tries to allocate a virtreg to the physreg already + // assigned to virtreg %1 and the physreg is computed as the best + // candidate for split, it may insert a COPY instruction. + // bb.100: + // %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, + // implicit $exec + // %2 = COPY %orig + // ... + // use %1 + // Thus %1 and %orig still have interference. We may add cost for the + // physreg candidate or abandon the candidate. const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg()); From 065ba28c368970d4846bf5190a5fbb0380ab8b24 Mon Sep 17 00:00:00 2001 From: Yuanke Luo Date: Thu, 20 Nov 2025 10:02:34 +0800 Subject: [PATCH 4/4] Add test case Change original test case for checking "cannot spill all interferences" log and add another test case for full output checking. 
--- .../test/CodeGen/AMDGPU/spill-before-exec.mir | 96 ++-------- .../CodeGen/AMDGPU/spill-before-exec2.mir | 167 ++++++++++++++++++ 2 files changed, 177 insertions(+), 86 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir index a44f5b477f052..24c7480240608 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir @@ -1,8 +1,10 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 -# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -run-pass=greedy -o - %s | FileCheck %s +# REQUIRES: asserts +# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -debug-only=regalloc -run-pass=greedy -filetype=null %s 2>&1 | FileCheck %s --- -# Check that spill save/restore should be inserted after $exec mask is defined. +# Check that physreg candidate is not used since cannot be spilled in a block, +# e.g. before exec mask preamble +# CHECK: , cannot spill all interferences. 
name: foo tracksRegLiveness: true @@ -10,88 +12,6 @@ machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | - ; CHECK-LABEL: name: foo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr102_sgpr103 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 - ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.3, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY 
[[COPY1]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY21:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] - ; CHECK-NEXT: S_BRANCH %bb.1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: liveins: $sgpr98_sgpr99, $sgpr102_sgpr103 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x80000000) - ; CHECK-NEXT: liveins: $sgpr102_sgpr103 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc - ; CHECK-NEXT: S_BRANCH %bb.4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: $exec = 
IMPLICIT_DEF - ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY1]].sub0_sub1, [[SI_SPILL_S128_RESTORE]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE1:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE2:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE1]].sub0_sub1, [[SI_SPILL_S128_RESTORE2]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE3:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE4:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE3]].sub0_sub1, [[SI_SPILL_S128_RESTORE4]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY2]].sub0_sub1, [[COPY3]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY4]].sub0_sub1, [[COPY5]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY6]].sub0_sub1, [[COPY7]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY8]].sub0_sub1, [[COPY9]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY10]].sub0_sub1, [[COPY11]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY12]].sub0_sub1, [[COPY13]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY14]].sub0_sub1, [[COPY15]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY16]].sub0_sub1, [[COPY17]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: 
S_CMP_EQ_U64 [[COPY18]].sub0_sub1, [[COPY19]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY20]].sub0_sub1, [[COPY21]].sub2_sub3, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit [[S_OR_SAVEEXEC_B64_1]], implicit $vgpr0 bb.0: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103 @@ -122,11 +42,13 @@ body: | %24:sgpr_128 = COPY %1 %25:sgpr_128 = COPY %1 %26:sgpr_128 = COPY %1 + %27:sgpr_128 = COPY %1 S_BRANCH %bb.1 bb.1: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103 + %27 = IMPLICIT_DEF implicit-def $exec %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec $exec = S_XOR_B64_term $exec, %0, implicit-def $scc S_CBRANCH_EXECZ %bb.3, implicit $exec @@ -135,6 +57,7 @@ body: | bb.2: liveins: $sgpr98_sgpr99, $sgpr102_sgpr103 + %27 = IMPLICIT_DEF implicit-def $exec %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec $exec = S_XOR_B64_term $exec, %0, implicit-def $scc S_CBRANCH_EXECZ %bb.3, implicit $exec @@ -143,12 +66,12 @@ body: | bb.3: liveins: $sgpr102_sgpr103 + %27 = IMPLICIT_DEF implicit-def $exec %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec $exec = S_XOR_B64_term $exec, %0, implicit-def $scc S_BRANCH %bb.4 bb.4: - $exec = IMPLICIT_DEF S_CMP_EQ_U64 %1.sub0_sub1, %2.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %3.sub0_sub1, %4.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %5.sub0_sub1, %6.sub2_sub3, implicit-def $scc @@ -162,6 +85,7 @@ body: | S_CMP_EQ_U64 %21.sub0_sub1, %22.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %23.sub0_sub1, %24.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %25.sub0_sub1, %26.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %26.sub0_sub1, %27.sub2_sub3, implicit-def $scc $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, 
implicit $vgpr0 ... diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir new file mode 100644 index 0000000000000..a44f5b477f052 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir @@ -0,0 +1,167 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -run-pass=greedy -o - %s | FileCheck %s + +--- +# Check that spill save/restore should be inserted after $exec mask is defined. + +name: foo +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr102_sgpr103 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.3, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_128 = COPY 
[[COPY1]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:sgpr_128 = COPY [[COPY1]] + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: liveins: $sgpr98_sgpr99, $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ 
%bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: liveins: $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: $exec = IMPLICIT_DEF + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY1]].sub0_sub1, [[SI_SPILL_S128_RESTORE]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE1:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE2:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE1]].sub0_sub1, [[SI_SPILL_S128_RESTORE2]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE3:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE4:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE3]].sub0_sub1, [[SI_SPILL_S128_RESTORE4]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY2]].sub0_sub1, [[COPY3]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY4]].sub0_sub1, [[COPY5]].sub2_sub3, implicit-def $scc + ; 
CHECK-NEXT: S_CMP_EQ_U64 [[COPY6]].sub0_sub1, [[COPY7]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY8]].sub0_sub1, [[COPY9]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY10]].sub0_sub1, [[COPY11]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY12]].sub0_sub1, [[COPY13]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY14]].sub0_sub1, [[COPY15]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY16]].sub0_sub1, [[COPY17]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY18]].sub0_sub1, [[COPY19]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY20]].sub0_sub1, [[COPY21]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit [[S_OR_SAVEEXEC_B64_1]], implicit $vgpr0 + bb.0: + liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103 + + %0:sreg_64 = COPY $sgpr102_sgpr103 + %1:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + %2:sgpr_128 = COPY %1 + %3:sgpr_128 = COPY %1 + %4:sgpr_128 = COPY %1 + %5:sgpr_128 = COPY %1 + %6:sgpr_128 = COPY %1 + %7:sgpr_128 = COPY %1 + %8:sgpr_128 = COPY %1 + %9:sgpr_128 = COPY %1 + %10:sgpr_128 = COPY %1 + %11:sgpr_128 = COPY %1 + %12:sgpr_128 = COPY %1 + %13:sgpr_128 = COPY %1 + %14:sgpr_128 = COPY %1 + %15:sgpr_128 = COPY %1 + %16:sgpr_128 = COPY %1 + %17:sgpr_128 = COPY %1 + %18:sgpr_128 = COPY %1 + %19:sgpr_128 = COPY %1 + %20:sgpr_128 = COPY %1 + %21:sgpr_128 = COPY %1 + %22:sgpr_128 = COPY %1 + %23:sgpr_128 = COPY %1 + %24:sgpr_128 = COPY %1 + %25:sgpr_128 = COPY %1 + %26:sgpr_128 = COPY %1 + S_BRANCH %bb.1 + + bb.1: + liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103 + + %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec + $exec = S_XOR_B64_term $exec, %0, implicit-def $scc + S_CBRANCH_EXECZ %bb.3, implicit $exec + S_BRANCH %bb.2 + + bb.2: + 
liveins: $sgpr98_sgpr99, $sgpr102_sgpr103 + + %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec + $exec = S_XOR_B64_term $exec, %0, implicit-def $scc + S_CBRANCH_EXECZ %bb.3, implicit $exec + S_BRANCH %bb.4 + + bb.3: + liveins: $sgpr102_sgpr103 + + %0:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec + $exec = S_XOR_B64_term $exec, %0, implicit-def $scc + S_BRANCH %bb.4 + + bb.4: + $exec = IMPLICIT_DEF + S_CMP_EQ_U64 %1.sub0_sub1, %2.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %3.sub0_sub1, %4.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %5.sub0_sub1, %6.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %7.sub0_sub1, %8.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %9.sub0_sub1, %10.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %11.sub0_sub1, %12.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %13.sub0_sub1, %14.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %15.sub0_sub1, %16.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %17.sub0_sub1, %18.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %19.sub0_sub1, %20.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %21.sub0_sub1, %22.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %23.sub0_sub1, %24.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %25.sub0_sub1, %26.sub2_sub3, implicit-def $scc + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, implicit $vgpr0 +...