diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 1bc7607890328..a6a65b444b466 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -773,8 +773,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf, // Abort if the spill cannot be inserted at the MBB' start if (((BC.Entry == SpillPlacement::MustSpill) || (BC.Entry == SpillPlacement::PrefSpill)) && - SlotIndex::isEarlierInstr(BI.FirstInstr, - SA->getFirstSplitPoint(BC.Number))) + !SA->canSplitBeforeProlog(BC.Number)) return false; } @@ -829,11 +828,7 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf, BCS[B].Number = Number; // Abort if the spill cannot be inserted at the MBB' start - MachineBasicBlock *MBB = MF->getBlockNumbered(Number); - auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr(); - if (FirstNonDebugInstr != MBB->end() && - SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr), - SA->getFirstSplitPoint(Number))) + if (!SA->canSplitBeforeProlog(Number)) return false; // Interference for the live-in value. if (Intf.first() <= Indexes->getMBBStartIdx(Number)) diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 8ec4bfbb5a330..f27ff674dcf8c 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -147,6 +147,54 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI, return LIS.getInstructionFromIndex(LIP); } +bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI, + const MachineBasicBlock &MBB) { + const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); + + for (auto &MI : MBB) { + if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() || + MI.isPseudoProbe()) + continue; + + if (!TII->isBasicBlockPrologue(MI)) + return true; + + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual()) + continue; + + // For the AMDGPU target, if an MBB contains an exec mask restore + // preamble, SplitEditor may end up in a state where it cannot insert a + // spill instruction at the beginning of the MBB. + // E.g., given the MIR: + // bb.100: + // %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, + // implicit $exec + // ... + // use %1 + // If the register allocator tries to assign another virtreg to the + // physreg already assigned to virtreg %1, and that physreg is computed + // as the best split candidate, it may insert a COPY instruction: + // bb.100: + // %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc, + // implicit $exec + // %2 = COPY %orig + // ... + // use %1 + // Thus %1 and %orig still interfere. We may add cost to the physreg + // candidate or abandon the candidate.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg()); + const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg()); + if (TRI->getCommonSubClass(RC, CurRC)) + return false; + } + } + + return true; +} + //===----------------------------------------------------------------------===// // Split Analysis //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h index de255911268f2..a9fc921534d0e 100644 --- a/llvm/lib/CodeGen/SplitKit.h +++ b/llvm/lib/CodeGen/SplitKit.h @@ -89,6 +89,9 @@ class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis { return Res; } + /// Return true if we can split \pCurLI before \pMBB's prolog. + bool canSplitBeforeProlog(const LiveInterval &CurLI, + const MachineBasicBlock &MBB); }; /// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting @@ -247,6 +250,11 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis { SlotIndex getFirstSplitPoint(unsigned Num) { return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num)); } + + bool canSplitBeforeProlog(unsigned Num) { + MachineBasicBlock *BB = MF.getBlockNumbered(Num); + return IPA.canSplitBeforeProlog(*CurLI, *BB); + } }; /// SplitEditor - Edit machine code and LiveIntervals for live range diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index d3ebd92f0677b..5ff30224f87ea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -154717,13 +154717,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s73, s21 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v44, s19, 0 ; SI-NEXT: v_writelane_b32 v44, s18, 1 ; SI-NEXT: v_writelane_b32 v44, s17, 2 ; SI-NEXT: v_writelane_b32 v44, s16, 3 -; SI-NEXT: v_writelane_b32 v41, s30, 0 ; SI-NEXT: v_writelane_b32 v41, s31, 1 ; SI-NEXT: v_writelane_b32 v41, s34, 2 ; SI-NEXT: v_writelane_b32 v41, s35, 3 @@ -154747,9 +154747,8 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s69, 21 ; SI-NEXT: v_writelane_b32 v41, s70, 22 ; SI-NEXT: v_writelane_b32 v41, s71, 23 -; SI-NEXT: s_mov_b32 s74, s29 -; SI-NEXT: s_mov_b32 s78, s28 -; SI-NEXT: s_mov_b32 s76, s27 +; SI-NEXT: s_mov_b32 s57, s28 +; SI-NEXT: s_mov_b32 s47, s27 ; SI-NEXT: v_writelane_b32 v41, s80, 24 ; SI-NEXT: v_writelane_b32 v41, s81, 25 ; SI-NEXT: v_writelane_b32 v41, s82, 26 @@ -154759,7 +154758,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: s_mov_b32 s47, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 @@ -154769,95 +154767,101 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:152 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: v_writelane_b32 v42, s89, 0 +; SI-NEXT: v_readfirstlane_b32 s91, v10 +; SI-NEXT: v_writelane_b32 v42, s90, 1 +; SI-NEXT: v_readfirstlane_b32 s92, v8 +; SI-NEXT: v_writelane_b32 v42, s91, 2 +; SI-NEXT: v_readfirstlane_b32 s93, v7 +; SI-NEXT: v_writelane_b32 v42, s92, 3 +; SI-NEXT: v_readfirstlane_b32 s94, v13 +; SI-NEXT: v_writelane_b32 v42, s93, 4 +; SI-NEXT: v_readfirstlane_b32 s95, v14 +; SI-NEXT: v_writelane_b32 v42, s94, 5 +; SI-NEXT: v_writelane_b32 v42, s95, 6 +; SI-NEXT: v_readfirstlane_b32 s30, v17 +; SI-NEXT: v_readfirstlane_b32 s31, v18 +; SI-NEXT: v_readfirstlane_b32 s34, v16 +; SI-NEXT: v_readfirstlane_b32 s35, v15 +; SI-NEXT: v_readfirstlane_b32 s36, v21 ; SI-NEXT: v_readfirstlane_b32 s37, v22 -; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s38, v20 -; SI-NEXT: v_writelane_b32 v43, s37, 0 ; SI-NEXT: v_readfirstlane_b32 s39, v19 -; SI-NEXT: v_writelane_b32 v43, s38, 1 ; SI-NEXT: v_readfirstlane_b32 s48, v25 -; SI-NEXT: v_writelane_b32 v43, s39, 2 ; SI-NEXT: v_readfirstlane_b32 s49, v26 -; SI-NEXT: v_writelane_b32 v43, s48, 3 ; SI-NEXT: v_readfirstlane_b32 s50, v24 -; SI-NEXT: v_writelane_b32 v43, s49, 4 ; SI-NEXT: v_readfirstlane_b32 s51, v23 -; SI-NEXT: v_writelane_b32 v43, s50, 5 ; SI-NEXT: v_readfirstlane_b32 s52, v29 -; SI-NEXT: v_writelane_b32 v43, s51, 6 ; SI-NEXT: v_readfirstlane_b32 s53, v30 -; SI-NEXT: v_writelane_b32 v43, s52, 7 -; SI-NEXT: v_readfirstlane_b32 s54, v28 -; SI-NEXT: v_writelane_b32 v43, s53, 8 -; SI-NEXT: v_readfirstlane_b32 s55, v27 -; SI-NEXT: v_writelane_b32 v43, s54, 9 -; SI-NEXT: v_writelane_b32 v43, s55, 10 -; SI-NEXT: s_mov_b32 s57, s24 -; SI-NEXT: v_readfirstlane_b32 s16, v1 -; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s6, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: v_writelane_b32 v44, s4, 4 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v44, s4, 4 +; SI-NEXT: v_writelane_b32 v44, s4, 5 ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v44, s4, 5 -; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v44, s4, 6 -; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v44, s4, 7 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v44, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v44, s4, 8 +; SI-NEXT: v_writelane_b32 v44, s4, 9 ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v44, s4, 9 +; SI-NEXT: 
v_writelane_b32 v44, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v44, s4, 10 +; SI-NEXT: v_writelane_b32 v44, s4, 11 +; SI-NEXT: v_readfirstlane_b32 s54, v28 +; SI-NEXT: v_readfirstlane_b32 s55, v27 +; SI-NEXT: s_mov_b32 s6, s23 +; SI-NEXT: s_mov_b32 s23, s21 +; SI-NEXT: s_mov_b32 s58, s26 +; SI-NEXT: s_mov_b32 s40, s25 +; SI-NEXT: s_mov_b32 s25, s24 +; SI-NEXT: v_readfirstlane_b32 s16, v1 +; SI-NEXT: v_readfirstlane_b32 s17, v2 ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 ; SI-NEXT: v_readfirstlane_b32 s77, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: v_readfirstlane_b32 s91, v10 -; SI-NEXT: v_readfirstlane_b32 s92, v8 -; SI-NEXT: v_readfirstlane_b32 s93, v7 -; SI-NEXT: v_readfirstlane_b32 s94, v13 -; SI-NEXT: v_readfirstlane_b32 s95, v14 -; SI-NEXT: v_readfirstlane_b32 s30, v17 -; SI-NEXT: v_readfirstlane_b32 s31, v18 -; SI-NEXT: v_readfirstlane_b32 s34, v16 -; SI-NEXT: v_readfirstlane_b32 s35, v15 -; SI-NEXT: v_readfirstlane_b32 s36, v21 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s26, v53 +; SI-NEXT: v_readfirstlane_b32 s46, v54 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s61, v55 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s24, v40 +; SI-NEXT: v_readfirstlane_b32 s62, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v44, s4, 11 +; SI-NEXT: v_writelane_b32 v44, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v44, s4, 12 +; SI-NEXT: v_writelane_b32 v44, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v44, s4, 13 +; SI-NEXT: v_writelane_b32 v44, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v44, s4, 14 +; SI-NEXT: v_writelane_b32 v44, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v44, s4, 15 +; SI-NEXT: v_writelane_b32 v44, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -154867,40 +154871,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 +; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s75, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v44, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s21, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v44, s4, 16 +; SI-NEXT: v_writelane_b32 v44, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v44, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s40, v35 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v44, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s61, v36 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v44, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(9) 
-; SI-NEXT: v_readfirstlane_b32 s63, v37 +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v44, s4, 23 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v44, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s59, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v44, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s56, v38 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v44, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s43, v39 +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v44, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s46, v48 +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v44, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s42, v49 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: v_writelane_b32 v44, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s13, v50 +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: v_writelane_b32 v44, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s45, v51 +; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 @@ -154908,45 +154923,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: v_writelane_b32 v44, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s88, v32 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v44, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s79, v33 +; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: v_writelane_b32 v44, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v44, s4, 18 +; SI-NEXT: v_writelane_b32 v44, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v44, s4, 19 +; SI-NEXT: v_writelane_b32 v44, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v44, s4, 20 +; SI-NEXT: v_writelane_b32 v44, s4, 35 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v44, s4, 21 +; SI-NEXT: v_readfirstlane_b32 s43, v37 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v44, s4, 22 +; SI-NEXT: v_writelane_b32 v44, s4, 36 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v44, s4, 23 +; SI-NEXT: v_writelane_b32 v44, s4, 37 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v44, s4, 24 +; SI-NEXT: v_writelane_b32 v44, s4, 38 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v44, s4, 25 +; SI-NEXT: 
v_writelane_b32 v44, s4, 39 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v44, s4, 26 +; SI-NEXT: v_writelane_b32 v44, s4, 40 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v44, s4, 27 +; SI-NEXT: v_writelane_b32 v44, s4, 41 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 -; SI-NEXT: v_writelane_b32 v44, s4, 28 +; SI-NEXT: v_writelane_b32 v44, s4, 42 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -154962,41 +154979,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v44, s4, 29 +; SI-NEXT: v_writelane_b32 v44, s4, 43 +; SI-NEXT: v_writelane_b32 v44, s22, 44 +; SI-NEXT: v_writelane_b32 v44, s6, 45 +; SI-NEXT: v_writelane_b32 v44, s23, 46 +; SI-NEXT: v_writelane_b32 v44, s20, 47 +; SI-NEXT: v_writelane_b32 v44, s58, 48 +; SI-NEXT: v_writelane_b32 v44, s47, 49 +; SI-NEXT: v_writelane_b32 v44, s40, 50 +; SI-NEXT: v_writelane_b32 v44, s25, 51 +; SI-NEXT: v_writelane_b32 v44, s29, 52 +; SI-NEXT: v_writelane_b32 v44, s57, 53 +; SI-NEXT: v_writelane_b32 v44, s62, 54 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v44, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v44, s4, 31 -; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v44, s4, 32 -; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: v_writelane_b32 v44, s4, 33 -; SI-NEXT: v_writelane_b32 v44, s22, 34 -; SI-NEXT: v_writelane_b32 v44, s23, 35 -; SI-NEXT: v_writelane_b32 v44, s73, 36 -; SI-NEXT: v_writelane_b32 v44, s20, 37 -; SI-NEXT: v_writelane_b32 v44, s47, 38 -; SI-NEXT: v_writelane_b32 v44, s76, 39 -; SI-NEXT: v_writelane_b32 v44, s25, 40 -; SI-NEXT: v_writelane_b32 v44, s57, 41 -; SI-NEXT: v_writelane_b32 v44, s74, 42 -; SI-NEXT: v_writelane_b32 v44, s78, 43 -; SI-NEXT: v_writelane_b32 v44, s24, 44 -; SI-NEXT: v_writelane_b32 v44, s16, 45 -; SI-NEXT: v_writelane_b32 v44, s17, 46 -; SI-NEXT: v_writelane_b32 v44, s18, 47 -; SI-NEXT: v_writelane_b32 v44, s19, 48 -; SI-NEXT: v_writelane_b32 v44, s77, 49 -; SI-NEXT: v_writelane_b32 v44, s89, 50 -; SI-NEXT: v_writelane_b32 v44, s90, 51 -; SI-NEXT: v_writelane_b32 v44, s91, 52 -; SI-NEXT: v_writelane_b32 v44, s92, 53 -; SI-NEXT: v_writelane_b32 v44, s93, 54 -; SI-NEXT: v_writelane_b32 v44, s94, 55 -; SI-NEXT: v_writelane_b32 v44, s95, 56 +; SI-NEXT: v_readfirstlane_b32 s21, v52 +; SI-NEXT: v_writelane_b32 v44, s61, 55 +; SI-NEXT: v_writelane_b32 v44, s21, 56 +; SI-NEXT: v_writelane_b32 v44, s26, 57 +; SI-NEXT: v_writelane_b32 v44, s46, 58 +; SI-NEXT: v_writelane_b32 v44, s16, 59 +; SI-NEXT: v_writelane_b32 v44, s17, 60 +; SI-NEXT: v_writelane_b32 v44, s18, 61 +; SI-NEXT: v_writelane_b32 v44, s19, 62 +; SI-NEXT: v_writelane_b32 v44, s77, 63 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s58, v33 +; SI-NEXT: v_readfirstlane_b32 s13, v33 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s10, v34 ; SI-NEXT: s_waitcnt vmcnt(8) @@ -155004,7 +155011,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s28, v31 ; SI-NEXT: v_readfirstlane_b32 s27, 
v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s29, v36 +; SI-NEXT: v_readfirstlane_b32 s56, v36 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s69, v37 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -155035,17 +155042,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_readfirstlane_b32 vcc_lo, v12 ; SI-NEXT: v_readfirstlane_b32 vcc_hi, v11 -; SI-NEXT: v_writelane_b32 v44, vcc_lo, 57 -; SI-NEXT: v_writelane_b32 v44, vcc_hi, 58 -; SI-NEXT: v_writelane_b32 v44, s30, 59 -; SI-NEXT: v_writelane_b32 v44, s31, 60 -; SI-NEXT: v_writelane_b32 v44, s34, 61 -; SI-NEXT: v_writelane_b32 v44, s35, 62 -; SI-NEXT: v_writelane_b32 v44, s36, 63 +; SI-NEXT: v_writelane_b32 v42, vcc_lo, 7 +; SI-NEXT: v_writelane_b32 v42, vcc_hi, 8 +; SI-NEXT: v_writelane_b32 v42, s30, 9 +; SI-NEXT: v_writelane_b32 v42, s31, 10 +; SI-NEXT: v_writelane_b32 v42, s34, 11 +; SI-NEXT: v_writelane_b32 v42, s35, 12 +; SI-NEXT: v_writelane_b32 v42, s36, 13 +; SI-NEXT: v_writelane_b32 v42, s37, 14 +; SI-NEXT: v_writelane_b32 v42, s38, 15 +; SI-NEXT: v_writelane_b32 v42, s39, 16 +; SI-NEXT: v_writelane_b32 v42, s48, 17 +; SI-NEXT: v_writelane_b32 v42, s49, 18 +; SI-NEXT: v_writelane_b32 v42, s50, 19 +; SI-NEXT: v_writelane_b32 v42, s51, 20 +; SI-NEXT: v_writelane_b32 v42, s52, 21 +; SI-NEXT: v_writelane_b32 v42, s53, 22 +; SI-NEXT: v_writelane_b32 v42, s54, 23 +; SI-NEXT: v_writelane_b32 v42, s55, 24 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s60, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s62, v32 +; SI-NEXT: v_readfirstlane_b32 s63, v32 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s83, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -155069,7 +155087,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s26, v48 +; SI-NEXT: v_readfirstlane_b32 s75, v48 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s15, v49 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -155103,48 +155121,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s65, v48 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s64, v49 -; SI-NEXT: v_writelane_b32 v43, s64, 11 +; SI-NEXT: v_writelane_b32 v42, s64, 25 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s67, v50 -; SI-NEXT: v_writelane_b32 v43, s65, 12 +; SI-NEXT: v_writelane_b32 v42, s65, 26 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s84, v51 -; SI-NEXT: v_writelane_b32 v43, s67, 13 -; SI-NEXT: v_writelane_b32 v43, s84, 14 -; SI-NEXT: v_writelane_b32 v43, s85, 15 -; SI-NEXT: v_writelane_b32 v43, s86, 16 -; SI-NEXT: v_writelane_b32 v43, s87, 17 -; SI-NEXT: v_writelane_b32 v43, s8, 18 -; SI-NEXT: v_writelane_b32 v43, s99, 19 -; SI-NEXT: v_writelane_b32 v43, s12, 20 -; SI-NEXT: v_writelane_b32 v43, s44, 21 -; SI-NEXT: v_writelane_b32 v43, s97, 22 -; SI-NEXT: v_writelane_b32 v43, s15, 23 -; SI-NEXT: v_writelane_b32 v43, s96, 24 -; SI-NEXT: v_writelane_b32 v43, s98, 25 -; SI-NEXT: v_writelane_b32 v43, s83, 26 -; SI-NEXT: v_writelane_b32 v43, s82, 27 -; SI-NEXT: v_writelane_b32 v43, s9, 28 -; SI-NEXT: v_writelane_b32 v43, s81, 29 -; SI-NEXT: v_writelane_b32 v43, s80, 30 -; SI-NEXT: v_writelane_b32 v43, s7, 31 
-; SI-NEXT: v_writelane_b32 v43, s72, 32 -; SI-NEXT: v_writelane_b32 v43, s26, 33 -; SI-NEXT: v_writelane_b32 v43, s41, 34 -; SI-NEXT: v_writelane_b32 v43, s14, 35 -; SI-NEXT: v_writelane_b32 v43, s69, 36 -; SI-NEXT: v_writelane_b32 v43, s71, 37 -; SI-NEXT: v_writelane_b32 v43, s70, 38 -; SI-NEXT: v_writelane_b32 v43, s68, 39 -; SI-NEXT: v_writelane_b32 v43, s60, 40 -; SI-NEXT: v_writelane_b32 v43, s62, 41 -; SI-NEXT: v_writelane_b32 v43, s11, 42 -; SI-NEXT: v_writelane_b32 v43, s10, 43 -; SI-NEXT: v_writelane_b32 v43, s58, 44 -; SI-NEXT: v_writelane_b32 v43, s66, 45 -; SI-NEXT: v_writelane_b32 v43, s29, 46 -; SI-NEXT: v_writelane_b32 v43, s28, 47 -; SI-NEXT: v_writelane_b32 v43, s27, 48 +; SI-NEXT: v_writelane_b32 v42, s67, 27 +; SI-NEXT: v_writelane_b32 v42, s84, 28 +; SI-NEXT: v_writelane_b32 v42, s85, 29 +; SI-NEXT: v_writelane_b32 v42, s86, 30 +; SI-NEXT: v_writelane_b32 v42, s87, 31 +; SI-NEXT: v_writelane_b32 v42, s8, 32 +; SI-NEXT: v_writelane_b32 v42, s99, 33 +; SI-NEXT: v_writelane_b32 v42, s12, 34 +; SI-NEXT: v_writelane_b32 v42, s44, 35 +; SI-NEXT: v_writelane_b32 v42, s97, 36 +; SI-NEXT: v_writelane_b32 v42, s15, 37 +; SI-NEXT: v_writelane_b32 v42, s96, 38 +; SI-NEXT: v_writelane_b32 v42, s98, 39 +; SI-NEXT: v_writelane_b32 v42, s83, 40 +; SI-NEXT: v_writelane_b32 v42, s82, 41 +; SI-NEXT: v_writelane_b32 v42, s9, 42 +; SI-NEXT: v_writelane_b32 v42, s81, 43 +; SI-NEXT: v_writelane_b32 v42, s80, 44 +; SI-NEXT: v_writelane_b32 v42, s7, 45 +; SI-NEXT: v_writelane_b32 v42, s72, 46 +; SI-NEXT: v_writelane_b32 v42, s75, 47 +; SI-NEXT: v_writelane_b32 v42, s41, 48 +; SI-NEXT: v_writelane_b32 v42, s14, 49 +; SI-NEXT: v_writelane_b32 v42, s69, 50 +; SI-NEXT: v_writelane_b32 v42, s71, 51 +; SI-NEXT: v_writelane_b32 v42, s70, 52 +; SI-NEXT: v_writelane_b32 v42, s68, 53 +; SI-NEXT: v_writelane_b32 v42, s59, 54 +; SI-NEXT: v_writelane_b32 v42, s63, 55 +; SI-NEXT: v_writelane_b32 v42, s11, 56 +; SI-NEXT: v_writelane_b32 v42, s10, 57 +; SI-NEXT: v_writelane_b32 v42, s13, 58 +; SI-NEXT: v_writelane_b32 v42, s66, 59 +; SI-NEXT: v_writelane_b32 v42, s56, 60 +; SI-NEXT: v_writelane_b32 v42, s28, 61 +; SI-NEXT: v_writelane_b32 v42, s27, 62 ; SI-NEXT: s_cbranch_scc0 .LBB89_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_readlane_b32 s4, v44, 3 @@ -155153,107 +155171,97 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v43, s4, 58 +; SI-NEXT: v_writelane_b32 v43, s4, 10 ; SI-NEXT: v_readlane_b32 s4, v44, 1 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: v_readlane_b32 s5, v44, 0 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_writelane_b32 v43, s4, 59 +; SI-NEXT: s_or_b32 s45, s5, s4 ; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_lshl_b32 s5, s23, 8 +; SI-NEXT: s_or_b32 s24, s4, s5 ; SI-NEXT: s_and_b32 s5, s22, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_mov_b32 s22, s6 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 60 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s42, s6, s5 +; SI-NEXT: s_and_b32 s5, s25, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s25, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 61 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s47, 0xff +; SI-NEXT: s_lshl_b32 
s6, s40, 24 +; SI-NEXT: s_or_b32 s40, s6, s5 +; SI-NEXT: s_and_b32 s5, s58, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s76, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 62 -; SI-NEXT: s_or_b32 s4, s6, s5 -; SI-NEXT: s_and_b32 s5, s78, 0xff -; SI-NEXT: s_lshl_b32 s6, s74, 8 +; SI-NEXT: s_lshl_b32 s6, s47, 24 +; SI-NEXT: s_or_b32 s73, s6, s5 +; SI-NEXT: s_and_b32 s5, s57, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s16, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s17, 24 -; SI-NEXT: v_writelane_b32 v43, s4, 63 -; SI-NEXT: s_or_b32 s4, s16, s6 +; SI-NEXT: s_or_b32 s25, s16, s6 ; SI-NEXT: s_and_b32 s6, s89, 0xff -; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s77, 24 -; SI-NEXT: v_writelane_b32 v42, s4, 0 -; SI-NEXT: s_or_b32 s6, s16, s6 -; SI-NEXT: v_writelane_b32 v42, s6, 1 +; SI-NEXT: s_or_b32 s4, s16, s6 ; SI-NEXT: s_and_b32 s6, s18, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s19, 24 -; SI-NEXT: s_or_b32 s76, s16, s6 +; SI-NEXT: s_or_b32 s23, s16, s6 ; SI-NEXT: s_and_b32 s6, s93, 0xff ; SI-NEXT: s_lshl_b32 s16, s92, 8 ; SI-NEXT: s_or_b32 s6, s6, s16 ; SI-NEXT: s_and_b32 s16, s90, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s91, 24 -; SI-NEXT: s_or_b32 s77, s17, s16 +; SI-NEXT: s_or_b32 s76, s17, s16 ; SI-NEXT: s_and_b32 s16, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, vcc_lo, 24 -; SI-NEXT: s_or_b32 s25, s17, s16 +; SI-NEXT: s_or_b32 s22, s17, s16 ; SI-NEXT: s_and_b32 s16, s94, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s17, s95, 24 -; SI-NEXT: s_or_b32 s74, s17, s16 +; SI-NEXT: s_or_b32 s88, s17, s16 ; SI-NEXT: s_and_b32 s16, s35, 0xff ; SI-NEXT: s_lshl_b32 s17, s34, 8 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s17, s30, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s31, 24 -; SI-NEXT: s_or_b32 s78, s18, s17 +; SI-NEXT: s_or_b32 s74, s18, s17 ; SI-NEXT: s_and_b32 s17, s39, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s38, 24 -; SI-NEXT: s_mov_b32 s31, s88 -; SI-NEXT: s_or_b32 s88, s18, s17 +; SI-NEXT: s_or_b32 s77, s18, s17 ; SI-NEXT: s_and_b32 s17, s36, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s37, 24 -; SI-NEXT: s_or_b32 s89, s18, s17 +; SI-NEXT: s_or_b32 s79, s18, s17 ; SI-NEXT: s_and_b32 s17, s51, 0xff ; SI-NEXT: s_lshl_b32 s18, s50, 8 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s18, s48, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s49, 24 -; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_writelane_b32 v43, s18, 49 +; SI-NEXT: s_or_b32 s89, s19, s18 ; SI-NEXT: s_and_b32 s18, s55, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s54, 24 -; SI-NEXT: s_mov_b32 s73, s79 -; SI-NEXT: s_or_b32 s79, s19, s18 +; SI-NEXT: s_or_b32 s78, s19, s18 ; SI-NEXT: s_and_b32 s18, s52, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_lshl_b32 s19, s53, 24 -; SI-NEXT: s_or_b32 s94, s19, s18 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_writelane_b32 v42, s18, 63 ; SI-NEXT: s_and_b32 s18, s84, 0xff ; SI-NEXT: s_lshl_b32 s19, s67, 8 ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_and_b32 s19, s64, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s65, 24 -; SI-NEXT: s_or_b32 s95, s20, s19 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s19, 0 ; SI-NEXT: s_and_b32 s19, s12, 0xff ; 
SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s8, 24 @@ -155261,49 +155269,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s85, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s86, 24 -; SI-NEXT: s_or_b32 s12, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s8, 2 +; SI-NEXT: s_or_b32 s8, s20, s19 ; SI-NEXT: s_and_b32 s19, s80, 0xff ; SI-NEXT: s_lshl_b32 s20, s9, 8 ; SI-NEXT: s_or_b32 vcc_lo, s19, s20 ; SI-NEXT: s_and_b32 s19, s44, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s97, 24 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s8, 1 +; SI-NEXT: s_or_b32 s8, s20, s19 ; SI-NEXT: s_and_b32 s19, s41, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s7, 24 ; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s96, 0xff +; SI-NEXT: v_writelane_b32 v43, s8, 3 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s15, 24 -; SI-NEXT: v_writelane_b32 v43, s12, 50 -; SI-NEXT: s_or_b32 s12, s20, s19 -; SI-NEXT: s_and_b32 s19, s26, 0xff +; SI-NEXT: v_writelane_b32 v43, s7, 5 +; SI-NEXT: s_or_b32 s7, s20, s19 +; SI-NEXT: s_and_b32 s19, s75, 0xff ; SI-NEXT: s_lshl_b32 s20, s82, 8 ; SI-NEXT: s_or_b32 vcc_hi, s19, s20 ; SI-NEXT: s_and_b32 s19, s99, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s87, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 51 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s7, 4 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s72, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s81, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 52 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s7, 6 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s98, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s83, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 54 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s62, 0xff -; SI-NEXT: s_lshl_b32 s20, s60, 8 +; SI-NEXT: s_or_b32 s60, s20, s19 +; SI-NEXT: s_and_b32 s19, s63, 0xff +; SI-NEXT: s_lshl_b32 s20, s59, 8 ; SI-NEXT: s_or_b32 s84, s19, s20 ; SI-NEXT: s_and_b32 s19, s71, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s70, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 53 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: v_writelane_b32 v43, s7, 7 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s11, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s68, 24 @@ -155311,185 +155321,185 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s14, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s69, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 55 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s29, 0xff +; SI-NEXT: s_or_b32 s58, s20, s19 +; SI-NEXT: s_and_b32 s19, s56, 0xff ; SI-NEXT: s_lshl_b32 s20, s66, 8 ; SI-NEXT: s_or_b32 s85, s19, s20 ; SI-NEXT: s_and_b32 s19, s10, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s58, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 56 -; SI-NEXT: s_or_b32 s9, s20, s19 +; SI-NEXT: s_lshl_b32 s20, s13, 24 +; SI-NEXT: v_writelane_b32 v43, s7, 8 +; SI-NEXT: s_or_b32 s7, s20, s19 ; SI-NEXT: s_and_b32 s19, s27, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_lshl_b32 s20, s28, 24 -; SI-NEXT: v_writelane_b32 v43, s9, 57 -; SI-NEXT: s_or_b32 s23, s20, s19 -; SI-NEXT: s_and_b32 s19, s24, 0xff -; SI-NEXT: v_readlane_b32 s9, 
v44, 33 +; SI-NEXT: s_or_b32 s47, s20, s19 +; SI-NEXT: s_and_b32 s19, s62, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v44, 32 -; SI-NEXT: s_or_b32 s10, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 31 -; SI-NEXT: s_lshl_b32 s20, s9, 8 -; SI-NEXT: v_readlane_b32 s9, v44, 30 +; SI-NEXT: s_lshl_b32 s20, s61, 24 +; SI-NEXT: s_or_b32 s56, s20, s19 +; SI-NEXT: s_and_b32 s19, s46, 0xff +; SI-NEXT: s_lshl_b32 s20, s26, 8 +; SI-NEXT: v_writelane_b32 v43, s7, 9 ; SI-NEXT: s_or_b32 s86, s19, s20 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 29 +; SI-NEXT: s_and_b32 s19, s21, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 43 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s9, v44, 28 -; SI-NEXT: s_or_b32 s47, s20, s19 -; SI-NEXT: s_and_b32 s19, s9, 0xff -; SI-NEXT: v_readlane_b32 s9, v44, 27 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 42 +; SI-NEXT: s_or_b32 s61, s20, s19 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 41 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s9, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 26 -; SI-NEXT: s_or_b32 s9, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 25 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 40 +; SI-NEXT: s_or_b32 s46, s20, s19 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 39 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 24 -; SI-NEXT: s_or_b32 s24, s20, s19 -; SI-NEXT: s_mov_b32 s92, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 23 -; SI-NEXT: s_mov_b32 s36, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 8 -; SI-NEXT: v_readlane_b32 s11, v44, 22 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 38 +; SI-NEXT: s_or_b32 s62, s20, s19 +; SI-NEXT: s_mov_b32 s92, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 37 +; SI-NEXT: s_mov_b32 s37, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 36 ; SI-NEXT: s_or_b32 s87, s19, s20 -; SI-NEXT: s_mov_b32 s62, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 21 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: s_mov_b32 s30, s7 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s30, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 20 -; SI-NEXT: s_or_b32 s58, s20, s19 -; SI-NEXT: s_mov_b32 s91, s11 -; SI-NEXT: s_and_b32 s19, s11, 0xff -; SI-NEXT: v_readlane_b32 s11, v44, 19 +; SI-NEXT: s_lshl_b32 s20, s43, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 35 +; SI-NEXT: s_or_b32 s64, s20, s19 +; SI-NEXT: s_mov_b32 s91, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 34 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_mov_b32 s35, s11 -; SI-NEXT: s_lshl_b32 s20, s11, 24 -; SI-NEXT: v_readlane_b32 s11, v44, 18 -; SI-NEXT: s_mov_b32 s4, s46 -; SI-NEXT: s_or_b32 s46, s20, s19 -; SI-NEXT: s_and_b32 s19, s11, 0xff +; SI-NEXT: s_mov_b32 s36, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 33 +; SI-NEXT: s_mov_b32 s39, s43 +; SI-NEXT: s_or_b32 s43, s20, s19 +; SI-NEXT: s_mov_b32 s53, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 32 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s73, 24 -; SI-NEXT: s_mov_b32 s52, s73 
-; SI-NEXT: s_or_b32 s73, s20, s19 -; SI-NEXT: s_and_b32 s19, s31, 0xff -; SI-NEXT: s_lshl_b32 s20, s45, 8 +; SI-NEXT: s_mov_b32 s49, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 31 +; SI-NEXT: s_or_b32 s65, s20, s19 +; SI-NEXT: s_mov_b32 s90, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 30 +; SI-NEXT: s_mov_b32 s54, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 29 ; SI-NEXT: s_or_b32 s26, s19, s20 -; SI-NEXT: s_and_b32 s19, s13, 0xff +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 28 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s42, 24 +; SI-NEXT: s_mov_b32 s50, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 27 ; SI-NEXT: s_or_b32 s67, s20, s19 -; SI-NEXT: s_and_b32 s19, s4, 0xff +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 26 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s43, 24 -; SI-NEXT: s_mov_b32 s53, s42 -; SI-NEXT: s_or_b32 s42, s20, s19 -; SI-NEXT: s_and_b32 s19, s56, 0xff +; SI-NEXT: s_mov_b32 s38, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 25 +; SI-NEXT: s_or_b32 s66, s20, s19 +; SI-NEXT: s_mov_b32 s48, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s59, 24 +; SI-NEXT: s_mov_b32 s59, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 23 ; SI-NEXT: s_or_b32 s68, s20, s19 -; SI-NEXT: s_and_b32 s19, s63, 0xff -; SI-NEXT: s_lshl_b32 s20, s61, 8 -; SI-NEXT: v_readlane_b32 s93, v44, 17 +; SI-NEXT: s_mov_b32 s63, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 22 +; SI-NEXT: s_mov_b32 s52, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 21 ; SI-NEXT: s_or_b32 s27, s19, s20 -; SI-NEXT: s_and_b32 s19, s40, 0xff +; SI-NEXT: s_mov_b32 s51, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 20 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s93, 24 +; SI-NEXT: s_mov_b32 s55, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 19 ; SI-NEXT: s_or_b32 s70, s20, s19 -; SI-NEXT: s_and_b32 s19, s21, 0xff -; SI-NEXT: s_mov_b32 s51, s59 -; SI-NEXT: s_mov_b32 s59, s7 +; SI-NEXT: s_mov_b32 s93, s7 +; SI-NEXT: s_and_b32 s19, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v44, 18 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s75, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 16 -; SI-NEXT: s_mov_b32 s48, s56 -; SI-NEXT: s_mov_b32 s56, s10 +; SI-NEXT: s_mov_b32 s75, s7 +; SI-NEXT: s_lshl_b32 s20, s7, 24 +; SI-NEXT: v_readlane_b32 s7, v44, 17 ; SI-NEXT: s_or_b32 s69, s20, s19 ; SI-NEXT: s_mov_b32 s10, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 15 +; SI-NEXT: v_readlane_b32 s7, v44, 16 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s71, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 14 -; SI-NEXT: s_mov_b32 s39, s75 -; SI-NEXT: s_mov_b32 s75, s94 +; SI-NEXT: v_readlane_b32 s7, v44, 15 ; SI-NEXT: s_or_b32 s94, s20, s19 ; SI-NEXT: s_mov_b32 s41, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 13 +; SI-NEXT: v_readlane_b32 s7, v44, 14 ; SI-NEXT: s_mov_b32 s14, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 12 +; SI-NEXT: v_readlane_b32 s7, v44, 13 ; SI-NEXT: s_or_b32 s29, s19, s20 
; SI-NEXT: s_mov_b32 s81, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 11 -; SI-NEXT: s_mov_b32 s55, s45 -; SI-NEXT: s_mov_b32 s45, s9 +; SI-NEXT: v_readlane_b32 s7, v44, 12 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 10 -; SI-NEXT: s_mov_b32 s38, s11 +; SI-NEXT: v_readlane_b32 s7, v44, 11 ; SI-NEXT: s_or_b32 s11, s20, s19 ; SI-NEXT: s_mov_b32 s72, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 9 +; SI-NEXT: v_readlane_b32 s7, v44, 10 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s82, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 8 +; SI-NEXT: v_readlane_b32 s7, v44, 9 ; SI-NEXT: s_or_b32 s80, s20, s19 ; SI-NEXT: s_mov_b32 s83, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 7 +; SI-NEXT: v_readlane_b32 s7, v44, 8 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_mov_b32 s96, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 6 -; SI-NEXT: s_mov_b32 s90, s31 +; SI-NEXT: v_readlane_b32 s7, v44, 7 ; SI-NEXT: s_or_b32 s31, s20, s19 ; SI-NEXT: s_mov_b32 s98, s7 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v44, 5 +; SI-NEXT: v_readlane_b32 s7, v44, 6 ; SI-NEXT: s_mov_b32 s44, s7 ; SI-NEXT: s_lshl_b32 s20, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 4 -; SI-NEXT: s_mov_b32 s37, s43 -; SI-NEXT: s_mov_b32 s43, s93 -; SI-NEXT: s_mov_b32 s93, s21 +; SI-NEXT: v_readlane_b32 s7, v44, 5 ; SI-NEXT: s_or_b32 s21, s19, s20 ; SI-NEXT: s_and_b32 s19, s7, 0xff -; SI-NEXT: s_mov_b32 s34, s4 +; SI-NEXT: v_readlane_b32 s8, v44, 4 ; SI-NEXT: s_lshl_b32 s19, s19, 16 -; SI-NEXT: s_lshl_b32 s20, s22, 24 -; SI-NEXT: v_readlane_b32 s4, v43, 60 -; SI-NEXT: s_mov_b32 s54, s13 -; SI-NEXT: s_mov_b32 s13, s12 -; SI-NEXT: s_mov_b32 s50, s63 -; SI-NEXT: s_mov_b32 s63, s95 -; SI-NEXT: s_mov_b32 s49, s61 -; SI-NEXT: s_mov_b32 s61, s8 -; SI-NEXT: s_mov_b32 s60, s40 +; SI-NEXT: s_lshl_b32 s20, s8, 24 ; SI-NEXT: s_mov_b32 s12, s7 -; SI-NEXT: s_mov_b32 s7, s22 +; SI-NEXT: s_mov_b32 s7, s8 ; SI-NEXT: s_or_b32 s15, s20, s19 -; SI-NEXT: s_lshl_b32 s20, s4, 16 -; SI-NEXT: s_lshl_b32 s95, s5, 16 -; SI-NEXT: s_lshl_b32 s22, s6, 16 +; SI-NEXT: s_lshl_b32 s20, s24, 16 +; SI-NEXT: s_lshl_b32 s35, s5, 16 +; SI-NEXT: s_lshl_b32 s95, s6, 16 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_lshl_b32 s19, s17, 16 ; SI-NEXT: s_lshl_b32 s18, s18, 16 @@ -155500,16 +155510,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s97, s86, 16 ; SI-NEXT: s_lshl_b32 s28, s87, 16 ; SI-NEXT: s_lshl_b32 s87, s26, 16 -; SI-NEXT: v_readlane_b32 s26, v43, 58 +; SI-NEXT: v_readlane_b32 s26, v43, 10 ; SI-NEXT: s_lshl_b32 s86, s27, 16 -; SI-NEXT: v_readlane_b32 s27, v43, 59 -; SI-NEXT: v_readlane_b32 s66, v43, 63 +; SI-NEXT: s_mov_b32 s27, s45 ; SI-NEXT: s_lshl_b32 s85, s29, 16 -; SI-NEXT: v_readlane_b32 s29, v43, 62 -; SI-NEXT: v_readlane_b32 s65, v43, 61 -; SI-NEXT: v_readlane_b32 s64, v42, 0 +; SI-NEXT: s_mov_b32 s29, s40 +; SI-NEXT: s_mov_b32 s24, s42 ; SI-NEXT: s_lshl_b32 s84, s21, 16 -; SI-NEXT: v_readlane_b32 s21, v42, 1 +; SI-NEXT: s_mov_b32 s21, s4 ; SI-NEXT: s_cbranch_execnz .LBB89_3 ; SI-NEXT: .LBB89_2: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s98, 3 @@ -155550,7 +155558,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s6, s16, s6 ; SI-NEXT: s_add_i32 s16, s93, 3 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; 
SI-NEXT: s_lshl_b32 s17, s39, 8 +; SI-NEXT: s_lshl_b32 s17, s75, 8 ; SI-NEXT: s_add_i32 s18, s10, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_and_b32 s18, s18, 0xff @@ -155560,13 +155568,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: s_add_i32 s17, s50, 3 +; SI-NEXT: s_add_i32 s17, s63, 3 ; SI-NEXT: s_and_b32 s17, s17, 0xff -; SI-NEXT: s_lshl_b32 s18, s49, 8 -; SI-NEXT: s_add_i32 s19, s60, 3 +; SI-NEXT: s_lshl_b32 s18, s52, 8 +; SI-NEXT: s_add_i32 s19, s51, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s18, s43, 24 +; SI-NEXT: s_lshl_b32 s18, s55, 24 ; SI-NEXT: s_lshl_b32 s19, s19, 16 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_or_b32 s18, s18, s19 @@ -155574,11 +155582,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s18, s17 ; SI-NEXT: s_add_i32 s18, s34, 3 ; SI-NEXT: s_and_b32 s18, s18, 0xff -; SI-NEXT: s_lshl_b32 s19, s37, 8 +; SI-NEXT: s_lshl_b32 s19, s38, 8 ; SI-NEXT: s_add_i32 s20, s48, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s19, s51, 24 +; SI-NEXT: s_lshl_b32 s19, s59, 24 ; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_or_b32 s19, s19, s20 @@ -155586,11 +155594,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: s_add_i32 s19, s90, 3 ; SI-NEXT: s_and_b32 s19, s19, 0xff -; SI-NEXT: s_lshl_b32 s20, s55, 8 -; SI-NEXT: s_add_i32 s22, s54, 3 +; SI-NEXT: s_lshl_b32 s20, s54, 8 +; SI-NEXT: s_add_i32 s22, s13, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s20, s53, 24 +; SI-NEXT: s_lshl_b32 s20, s50, 24 ; SI-NEXT: s_lshl_b32 s22, s22, 16 ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_or_b32 s20, s20, s22 @@ -155598,11 +155606,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s19, s20, s19 ; SI-NEXT: s_add_i32 s20, s91, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s22, s35, 8 -; SI-NEXT: s_add_i32 s23, s38, 3 +; SI-NEXT: s_lshl_b32 s22, s36, 8 +; SI-NEXT: s_add_i32 s23, s53, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_and_b32 s23, s23, 0xff -; SI-NEXT: s_lshl_b32 s22, s52, 24 +; SI-NEXT: s_lshl_b32 s22, s49, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_or_b32 s22, s22, s23 @@ -155610,93 +155618,93 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s92, 3 ; SI-NEXT: s_and_b32 s22, s22, 0xff -; SI-NEXT: s_lshl_b32 s23, s36, 8 -; SI-NEXT: s_add_i32 s60, s62, 3 +; SI-NEXT: s_lshl_b32 s23, s37, 8 +; SI-NEXT: s_add_i32 s60, s30, 3 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_and_b32 s60, s60, 0xff -; SI-NEXT: s_lshl_b32 s23, s30, 24 +; SI-NEXT: s_lshl_b32 s23, s39, 24 ; SI-NEXT: s_lshl_b32 s60, s60, 16 ; SI-NEXT: s_addk_i32 s22, 0x300 ; SI-NEXT: s_or_b32 s23, s23, s60 ; SI-NEXT: s_and_b32 s22, s22, 0xffff -; SI-NEXT: v_readlane_b32 s7, v44, 28 +; SI-NEXT: v_readlane_b32 s7, v44, 42 ; SI-NEXT: s_or_b32 s22, s23, s22 ; SI-NEXT: s_add_i32 s23, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v44, 27 +; SI-NEXT: v_readlane_b32 s7, v44, 41 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s60, s7, 8 -; SI-NEXT: 
v_readlane_b32 s7, v44, 25 +; SI-NEXT: v_readlane_b32 s7, v44, 39 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_lshl_b32 s60, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 26 +; SI-NEXT: v_readlane_b32 s7, v44, 40 ; SI-NEXT: s_add_i32 s61, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_lshl_b32 s61, s61, 16 ; SI-NEXT: s_addk_i32 s23, 0x300 ; SI-NEXT: s_or_b32 s60, s60, s61 ; SI-NEXT: s_and_b32 s23, s23, 0xffff -; SI-NEXT: v_readlane_b32 s7, v44, 32 +; SI-NEXT: v_readlane_b32 s7, v44, 58 ; SI-NEXT: s_or_b32 s23, s60, s23 ; SI-NEXT: s_add_i32 s60, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v44, 31 +; SI-NEXT: v_readlane_b32 s7, v44, 57 ; SI-NEXT: s_and_b32 s60, s60, 0xff ; SI-NEXT: s_lshl_b32 s61, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 29 +; SI-NEXT: v_readlane_b32 s7, v44, 43 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_lshl_b32 s61, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 30 +; SI-NEXT: v_readlane_b32 s7, v44, 56 ; SI-NEXT: s_add_i32 s62, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 48 +; SI-NEXT: v_readlane_b32 s7, v42, 62 ; SI-NEXT: s_and_b32 s62, s62, 0xff ; SI-NEXT: s_add_i32 s59, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 47 +; SI-NEXT: v_readlane_b32 s7, v42, 61 ; SI-NEXT: s_lshl_b32 s62, s62, 16 ; SI-NEXT: s_addk_i32 s60, 0x300 ; SI-NEXT: s_and_b32 s59, s59, 0xff ; SI-NEXT: s_lshl_b32 s58, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v44, 33 +; SI-NEXT: v_readlane_b32 s7, v44, 55 ; SI-NEXT: s_or_b32 s61, s61, s62 ; SI-NEXT: s_and_b32 s60, s60, 0xffff ; SI-NEXT: s_or_b32 s58, s58, s59 ; SI-NEXT: s_lshl_b32 s59, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v44, 44 +; SI-NEXT: v_readlane_b32 s7, v44, 54 ; SI-NEXT: s_or_b32 s60, s61, s60 ; SI-NEXT: s_add_i32 s61, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 46 +; SI-NEXT: v_readlane_b32 s7, v42, 60 ; SI-NEXT: s_add_i32 s57, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 45 +; SI-NEXT: v_readlane_b32 s7, v42, 59 ; SI-NEXT: s_lshl_b32 s56, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 44 +; SI-NEXT: v_readlane_b32 s7, v42, 58 ; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 43 +; SI-NEXT: v_readlane_b32 s7, v42, 57 ; SI-NEXT: s_add_i32 s46, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 42 +; SI-NEXT: v_readlane_b32 s7, v42, 56 ; SI-NEXT: s_add_i32 s45, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 39 +; SI-NEXT: v_readlane_b32 s7, v42, 53 ; SI-NEXT: s_lshl_b32 s42, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 36 +; SI-NEXT: v_readlane_b32 s7, v42, 50 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 35 +; SI-NEXT: v_readlane_b32 s7, v42, 49 ; SI-NEXT: s_and_b32 s45, s45, 0xff ; SI-NEXT: s_add_i32 s14, s7, 3 ; SI-NEXT: s_or_b32 s42, s42, s45 ; SI-NEXT: s_and_b32 s14, s14, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s42, 0x300 -; SI-NEXT: v_readlane_b32 s7, v43, 41 +; SI-NEXT: v_readlane_b32 s7, v42, 55 ; SI-NEXT: s_and_b32 s57, s57, 0xff ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s15, s42, 0xffff ; SI-NEXT: s_add_i32 s44, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 40 +; SI-NEXT: v_readlane_b32 s7, v42, 54 ; SI-NEXT: s_or_b32 s56, s56, s57 ; SI-NEXT: s_or_b32 s57, s14, s15 ; SI-NEXT: s_and_b32 s14, s44, 0xff ; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 38 +; SI-NEXT: v_readlane_b32 s7, v42, 52 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 37 +; SI-NEXT: v_readlane_b32 s7, v42, 51 ; SI-NEXT: s_add_i32 s40, s7, 3 ; SI-NEXT: s_and_b32 s61, s61, 0xff ; SI-NEXT: s_and_b32 s40, s40, 0xff @@ -155711,15 +155719,15 @@ define inreg 
<64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s58, s59, s58 ; SI-NEXT: s_or_b32 s59, s15, s14 ; SI-NEXT: s_add_i32 s14, s6, 0x3000000 -; SI-NEXT: v_readlane_b32 s6, v43, 32 +; SI-NEXT: v_readlane_b32 s6, v42, 46 ; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 29 +; SI-NEXT: v_readlane_b32 s7, v42, 43 ; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s8, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 26 +; SI-NEXT: v_readlane_b32 s7, v42, 40 ; SI-NEXT: s_or_b32 s6, s8, s6 ; SI-NEXT: s_lshl_b32 s8, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 25 +; SI-NEXT: v_readlane_b32 s7, v42, 39 ; SI-NEXT: s_add_i32 s24, s7, 3 ; SI-NEXT: s_and_b32 s11, s24, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 @@ -155727,47 +155735,47 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s8, s8, s11 ; SI-NEXT: s_or_b32 s8, s8, s6 -; SI-NEXT: v_readlane_b32 s6, v43, 33 +; SI-NEXT: v_readlane_b32 s6, v42, 47 ; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 27 +; SI-NEXT: v_readlane_b32 s7, v42, 41 ; SI-NEXT: s_and_b32 s6, s12, 0xff ; SI-NEXT: s_lshl_b32 s11, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 17 +; SI-NEXT: v_readlane_b32 s7, v42, 31 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 19 +; SI-NEXT: v_readlane_b32 s7, v42, 33 ; SI-NEXT: s_add_i32 s12, s7, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: v_readlane_b32 s7, v43, 34 +; SI-NEXT: v_readlane_b32 s7, v42, 48 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_add_i32 s13, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 31 +; SI-NEXT: v_readlane_b32 s7, v42, 45 ; SI-NEXT: s_or_b32 s6, s11, s6 ; SI-NEXT: s_and_b32 s11, s13, 0xff ; SI-NEXT: s_lshl_b32 s10, s7, 8 -; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: v_readlane_b32 s7, v42, 37 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_lshl_b32 s11, s7, 24 -; SI-NEXT: v_readlane_b32 s7, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v42, 38 ; SI-NEXT: s_add_i32 s25, s7, 3 ; SI-NEXT: s_and_b32 s12, s25, 0xff ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: v_readlane_b32 s7, v43, 30 +; SI-NEXT: v_readlane_b32 s7, v42, 44 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s9, s7, 3 -; SI-NEXT: v_readlane_b32 s7, v43, 28 -; SI-NEXT: v_readlane_b32 s11, v43, 21 +; SI-NEXT: v_readlane_b32 s7, v42, 42 +; SI-NEXT: v_readlane_b32 s11, v42, 35 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: v_readlane_b32 s9, v43, 22 +; SI-NEXT: v_readlane_b32 s9, v42, 36 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_addk_i32 s7, 0x300 ; SI-NEXT: s_lshl_b32 s9, s9, 24 @@ -155775,15 +155783,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s9, s9, s11 ; SI-NEXT: s_or_b32 s7, s9, s7 -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: v_readlane_b32 s9, v42, 34 ; SI-NEXT: s_add_i32 s21, s9, 3 -; SI-NEXT: v_readlane_b32 s11, v43, 18 -; SI-NEXT: v_readlane_b32 s12, v43, 15 +; SI-NEXT: v_readlane_b32 s11, v42, 32 +; SI-NEXT: v_readlane_b32 s12, v42, 29 ; SI-NEXT: s_and_b32 s9, s21, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; 
SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v43, 16 +; SI-NEXT: v_readlane_b32 s11, v42, 30 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_addk_i32 s9, 0x300 ; SI-NEXT: s_lshl_b32 s11, s11, 24 @@ -155791,15 +155799,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: v_readlane_b32 s11, v43, 14 +; SI-NEXT: v_readlane_b32 s11, v42, 28 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_readlane_b32 s12, v43, 13 -; SI-NEXT: v_readlane_b32 s13, v43, 11 +; SI-NEXT: v_readlane_b32 s12, v42, 27 +; SI-NEXT: v_readlane_b32 s13, v42, 25 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 8 ; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v43, 12 +; SI-NEXT: v_readlane_b32 s12, v42, 26 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_addk_i32 s11, 0x300 ; SI-NEXT: s_lshl_b32 s12, s12, 24 @@ -155807,16 +155815,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: v_readlane_b32 s12, v43, 10 +; SI-NEXT: v_readlane_b32 s12, v42, 24 ; SI-NEXT: s_add_i32 s15, s16, 0x3000000 ; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: v_readlane_b32 s13, v43, 9 -; SI-NEXT: v_readlane_b32 s16, v43, 7 +; SI-NEXT: v_readlane_b32 s13, v42, 23 +; SI-NEXT: v_readlane_b32 s16, v42, 21 ; SI-NEXT: s_and_b32 s12, s12, 0xff ; SI-NEXT: s_lshl_b32 s13, s13, 8 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 8 +; SI-NEXT: v_readlane_b32 s13, v42, 22 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_lshl_b32 s13, s13, 24 @@ -155824,16 +155832,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s13, s13, s16 ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 6 +; SI-NEXT: v_readlane_b32 s13, v42, 20 ; SI-NEXT: s_add_i32 s40, s17, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s16, v43, 5 -; SI-NEXT: v_readlane_b32 s17, v43, 3 +; SI-NEXT: v_readlane_b32 s16, v42, 19 +; SI-NEXT: v_readlane_b32 s17, v42, 17 ; SI-NEXT: s_and_b32 s13, s13, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v43, 4 +; SI-NEXT: v_readlane_b32 s16, v42, 18 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_addk_i32 s13, 0x300 ; SI-NEXT: s_lshl_b32 s16, s16, 24 @@ -155841,16 +155849,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_or_b32 s13, s16, s13 -; SI-NEXT: v_readlane_b32 s16, v43, 2 +; SI-NEXT: v_readlane_b32 s16, v42, 16 ; SI-NEXT: s_add_i32 s41, s18, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 1 -; SI-NEXT: v_readlane_b32 s18, v44, 63 +; SI-NEXT: v_readlane_b32 s17, v42, 15 +; SI-NEXT: v_readlane_b32 s18, v42, 13 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 0 +; SI-NEXT: v_readlane_b32 s17, v42, 14 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 
24 @@ -155859,16 +155867,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s17, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v44, 62 +; SI-NEXT: v_readlane_b32 s16, v42, 12 ; SI-NEXT: s_add_i32 s42, s19, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s18, v44, 61 -; SI-NEXT: v_readlane_b32 s19, v44, 59 +; SI-NEXT: v_readlane_b32 s18, v42, 11 +; SI-NEXT: v_readlane_b32 s19, v42, 9 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v44, 60 +; SI-NEXT: v_readlane_b32 s18, v42, 10 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s18, s18, 24 @@ -155876,16 +155884,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: s_or_b32 s16, s18, s16 -; SI-NEXT: v_readlane_b32 s18, v44, 58 +; SI-NEXT: v_readlane_b32 s18, v42, 8 ; SI-NEXT: s_add_i32 s43, s20, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_readlane_b32 s19, v44, 57 -; SI-NEXT: v_readlane_b32 s20, v44, 55 +; SI-NEXT: v_readlane_b32 s19, v42, 7 +; SI-NEXT: v_readlane_b32 s20, v42, 5 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v44, 56 +; SI-NEXT: v_readlane_b32 s19, v42, 6 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_addk_i32 s18, 0x300 ; SI-NEXT: s_lshl_b32 s19, s19, 24 @@ -155893,15 +155901,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: s_or_b32 s19, s19, s20 ; SI-NEXT: s_or_b32 s18, s19, s18 -; SI-NEXT: v_readlane_b32 s19, v44, 54 +; SI-NEXT: v_readlane_b32 s19, v42, 4 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_readlane_b32 s20, v44, 53 -; SI-NEXT: v_readlane_b32 s21, v44, 51 +; SI-NEXT: v_readlane_b32 s20, v42, 3 +; SI-NEXT: v_readlane_b32 s21, v42, 1 ; SI-NEXT: s_and_b32 s19, s19, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 ; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v44, 52 +; SI-NEXT: v_readlane_b32 s20, v42, 2 ; SI-NEXT: s_and_b32 s21, s21, 0xff ; SI-NEXT: s_addk_i32 s19, 0x300 ; SI-NEXT: s_lshl_b32 s20, s20, 24 @@ -155909,16 +155917,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_and_b32 s19, s19, 0xffff ; SI-NEXT: s_or_b32 s20, s20, s21 ; SI-NEXT: s_or_b32 s19, s20, s19 -; SI-NEXT: v_readlane_b32 s20, v44, 50 +; SI-NEXT: v_readlane_b32 s20, v42, 0 ; SI-NEXT: s_add_i32 s44, s22, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s21, v44, 49 -; SI-NEXT: v_readlane_b32 s22, v44, 47 +; SI-NEXT: v_readlane_b32 s21, v44, 63 +; SI-NEXT: v_readlane_b32 s22, v44, 61 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s21, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s20, s21, s20 -; SI-NEXT: v_readlane_b32 s21, v44, 48 +; SI-NEXT: v_readlane_b32 s21, v44, 62 ; SI-NEXT: s_and_b32 s22, s22, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s21, s21, 24 @@ -155927,16 +155935,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s21, s21, s22 ; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: s_add_i32 
s21, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 43 +; SI-NEXT: v_readlane_b32 s20, v44, 53 ; SI-NEXT: s_add_i32 s45, s23, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s22, v44, 42 -; SI-NEXT: v_readlane_b32 s23, v44, 45 +; SI-NEXT: v_readlane_b32 s22, v44, 52 +; SI-NEXT: v_readlane_b32 s23, v44, 59 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s22, s22, 8 ; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_or_b32 s20, s22, s20 -; SI-NEXT: v_readlane_b32 s22, v44, 46 +; SI-NEXT: v_readlane_b32 s22, v44, 60 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s22, s22, 24 @@ -155945,15 +155953,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: s_or_b32 s20, s22, s20 ; SI-NEXT: s_add_i32 s22, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 41 +; SI-NEXT: v_readlane_b32 s20, v44, 51 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s23, v44, 40 -; SI-NEXT: v_readlane_b32 s24, v44, 38 +; SI-NEXT: v_readlane_b32 s23, v44, 50 +; SI-NEXT: v_readlane_b32 s24, v44, 48 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s20, s23, s20 -; SI-NEXT: v_readlane_b32 s23, v44, 39 +; SI-NEXT: v_readlane_b32 s23, v44, 49 ; SI-NEXT: s_and_b32 s24, s24, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s23, s23, 24 @@ -155962,15 +155970,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s23, s23, s24 ; SI-NEXT: s_or_b32 s20, s23, s20 ; SI-NEXT: s_add_i32 s23, s20, 0x3000000 -; SI-NEXT: v_readlane_b32 s20, v44, 37 +; SI-NEXT: v_readlane_b32 s20, v44, 47 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v44, 36 -; SI-NEXT: v_readlane_b32 s25, v44, 34 +; SI-NEXT: v_readlane_b32 s24, v44, 46 +; SI-NEXT: v_readlane_b32 s25, v44, 44 ; SI-NEXT: s_and_b32 s20, s20, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 ; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v44, 35 +; SI-NEXT: v_readlane_b32 s24, v44, 45 ; SI-NEXT: s_and_b32 s25, s25, 0xff ; SI-NEXT: s_addk_i32 s20, 0x300 ; SI-NEXT: s_lshl_b32 s24, s24, 24 @@ -155988,100 +155996,101 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_or_b32 s24, s25, s24 ; SI-NEXT: v_readlane_b32 s25, v44, 0 ; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 ; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_addk_i32 s24, 0x300 ; SI-NEXT: s_lshl_b32 s25, s25, 24 ; SI-NEXT: s_lshl_b32 s26, s26, 16 ; SI-NEXT: s_add_i32 s9, s9, 0x3000000 -; SI-NEXT: s_add_i32 s11, s11, 0x3000000 ; SI-NEXT: s_add_i32 s18, s18, 0x3000000 ; SI-NEXT: s_and_b32 s24, s24, 0xffff ; SI-NEXT: s_or_b32 s25, s25, s26 -; SI-NEXT: s_and_b32 s89, s17, 0xffff0000 -; SI-NEXT: s_lshl_b32 s88, s17, 16 -; SI-NEXT: s_and_b32 s17, s13, 0xffff0000 -; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_and_b32 s76, s19, 0xffff0000 +; SI-NEXT: s_lshl_b32 s95, s19, 16 +; SI-NEXT: s_and_b32 s89, s13, 0xffff0000 +; SI-NEXT: s_lshl_b32 s19, s13, 16 +; SI-NEXT: s_and_b32 s13, s12, 0xffff0000 +; SI-NEXT: s_lshl_b32 s78, s12, 16 +; SI-NEXT: s_and_b32 s12, s11, 0xffff0000 ; SI-NEXT: s_or_b32 s24, s25, s24 -; SI-NEXT: s_and_b32 s74, s18, 0xffff0000 -; SI-NEXT: s_lshl_b32 s25, s18, 16 -; SI-NEXT: v_writelane_b32 v43, s17, 
49 -; SI-NEXT: s_and_b32 s63, s11, 0xffff0000 +; SI-NEXT: s_and_b32 s25, s22, 0xffff0000 +; SI-NEXT: s_lshl_b32 s35, s22, 16 +; SI-NEXT: s_and_b32 s88, s18, 0xffff0000 +; SI-NEXT: s_lshl_b32 s22, s18, 16 +; SI-NEXT: v_writelane_b32 v43, s12, 0 ; SI-NEXT: s_lshl_b32 s18, s11, 16 ; SI-NEXT: s_and_b32 s11, s9, 0xffff0000 -; SI-NEXT: s_and_b32 s46, s46, 0xff -; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s11, 50 -; SI-NEXT: s_lshl_b32 s61, s9, 16 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s11, 1 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s9, 2 ; SI-NEXT: s_and_b32 s9, s7, 0xffff0000 +; SI-NEXT: s_and_b32 s46, s46, 0xff +; SI-NEXT: s_and_b32 s79, s17, 0xffff0000 +; SI-NEXT: s_lshl_b32 s77, s17, 16 +; SI-NEXT: v_writelane_b32 v43, s9, 3 +; SI-NEXT: s_lshl_b32 s17, s7, 16 +; SI-NEXT: s_and_b32 s7, s10, 0xffff0000 ; SI-NEXT: s_lshl_b32 s46, s46, 16 ; SI-NEXT: s_addk_i32 s56, 0x300 -; SI-NEXT: s_add_i32 s8, s8, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s9, 51 -; SI-NEXT: s_lshl_b32 s17, s7, 16 -; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 4 +; SI-NEXT: s_lshl_b32 s7, s10, 16 ; SI-NEXT: s_or_b32 s46, s47, s46 ; SI-NEXT: s_and_b32 s47, s56, 0xffff -; SI-NEXT: v_writelane_b32 v43, s7, 52 -; SI-NEXT: s_and_b32 s7, s8, 0xffff0000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; SI-NEXT: v_writelane_b32 v43, s7, 5 +; SI-NEXT: s_and_b32 s7, s6, 0xffff0000 ; SI-NEXT: s_or_b32 s56, s46, s47 ; SI-NEXT: s_add_i32 s47, s58, 0x3000000 ; SI-NEXT: s_add_i32 s58, s59, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 53 +; SI-NEXT: v_writelane_b32 v43, s7, 6 ; SI-NEXT: s_lshl_b32 s7, s8, 16 -; SI-NEXT: s_add_i32 s57, s57, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 54 -; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s46, s60, 0x3000000 ; SI-NEXT: s_add_i32 s56, s56, 0x3000000 -; SI-NEXT: s_add_i32 s10, s10, 0x3000000 -; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s57, s57, 0x3000000 ; SI-NEXT: s_add_i32 s16, s16, 0x3000000 -; SI-NEXT: s_add_i32 s19, s19, 0x3000000 ; SI-NEXT: s_add_i32 s20, s20, 0x3000000 ; SI-NEXT: s_add_i32 s24, s24, 0x3000000 -; SI-NEXT: v_writelane_b32 v43, s7, 55 -; SI-NEXT: s_and_b32 s7, s57, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s7, 7 +; SI-NEXT: s_and_b32 s7, s58, 0xffff0000 ; SI-NEXT: s_and_b32 s27, s24, 0xffff0000 ; SI-NEXT: s_lshl_b32 s26, s24, 16 -; SI-NEXT: s_and_b32 s65, s20, 0xffff0000 +; SI-NEXT: s_and_b32 s24, s20, 0xffff0000 ; SI-NEXT: s_lshl_b32 s20, s20, 16 -; SI-NEXT: s_and_b32 s66, s23, 0xffff0000 +; SI-NEXT: s_and_b32 s73, s23, 0xffff0000 ; SI-NEXT: s_lshl_b32 s29, s23, 16 -; SI-NEXT: s_and_b32 s64, s22, 0xffff0000 -; SI-NEXT: s_lshl_b32 s95, s22, 16 -; SI-NEXT: s_and_b32 s76, s21, 0xffff0000 +; SI-NEXT: s_and_b32 s23, s21, 0xffff0000 ; SI-NEXT: s_lshl_b32 s21, s21, 16 -; SI-NEXT: s_and_b32 s77, s19, 0xffff0000 -; SI-NEXT: s_lshl_b32 s22, s19, 16 -; SI-NEXT: s_and_b32 s78, s16, 0xffff0000 +; SI-NEXT: s_and_b32 s74, s16, 0xffff0000 ; SI-NEXT: s_lshl_b32 s16, s16, 16 -; SI-NEXT: s_lshl_b32 s19, s13, 16 -; SI-NEXT: s_and_b32 s75, s12, 0xffff0000 -; SI-NEXT: s_lshl_b32 s79, s12, 16 -; SI-NEXT: s_and_b32 s13, s10, 0xffff0000 -; SI-NEXT: s_lshl_b32 s59, s10, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_and_b32 s60, s8, 0xffff0000 +; SI-NEXT: v_writelane_b32 v43, s7, 8 ; SI-NEXT: 
s_lshl_b32 s99, s58, 16 -; SI-NEXT: v_writelane_b32 v43, s7, 56 +; SI-NEXT: s_and_b32 s58, s57, 0xffff0000 ; SI-NEXT: s_lshl_b32 s57, s57, 16 ; SI-NEXT: s_and_b32 s7, s56, 0xffff0000 ; SI-NEXT: s_lshl_b32 s8, s56, 16 ; SI-NEXT: s_and_b32 s56, s47, 0xffff0000 -; SI-NEXT: s_lshl_b32 s23, s47, 16 -; SI-NEXT: s_and_b32 s47, s46, 0xffff0000 +; SI-NEXT: s_lshl_b32 s47, s47, 16 +; SI-NEXT: s_and_b32 s61, s46, 0xffff0000 ; SI-NEXT: s_lshl_b32 s97, s46, 16 -; SI-NEXT: s_and_b32 s24, s45, 0xffff0000 -; SI-NEXT: s_lshl_b32 s45, s45, 16 -; SI-NEXT: s_and_b32 s58, s44, 0xffff0000 +; SI-NEXT: s_and_b32 s62, s45, 0xffff0000 +; SI-NEXT: s_lshl_b32 s46, s45, 16 +; SI-NEXT: s_and_b32 s64, s44, 0xffff0000 ; SI-NEXT: s_lshl_b32 s28, s44, 16 -; SI-NEXT: s_and_b32 s73, s43, 0xffff0000 -; SI-NEXT: s_lshl_b32 s46, s43, 16 +; SI-NEXT: s_and_b32 s65, s43, 0xffff0000 +; SI-NEXT: s_lshl_b32 s43, s43, 16 ; SI-NEXT: s_and_b32 s67, s42, 0xffff0000 ; SI-NEXT: s_lshl_b32 s87, s42, 16 ; SI-NEXT: s_and_b32 s68, s41, 0xffff0000 -; SI-NEXT: s_lshl_b32 s42, s41, 16 +; SI-NEXT: s_lshl_b32 s66, s41, 16 ; SI-NEXT: s_and_b32 s70, s40, 0xffff0000 ; SI-NEXT: s_lshl_b32 s86, s40, 16 ; SI-NEXT: s_and_b32 s94, s15, 0xffff0000 @@ -156092,99 +156101,102 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_lshl_b32 s80, s5, 16 ; SI-NEXT: s_and_b32 s15, s4, 0xffff0000 ; SI-NEXT: s_lshl_b32 s84, s4, 16 -; SI-NEXT: v_writelane_b32 v43, s7, 57 +; SI-NEXT: v_writelane_b32 v42, s13, 63 +; SI-NEXT: v_writelane_b32 v43, s7, 9 ; SI-NEXT: .LBB89_3: ; %end ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s27 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s26 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 -; SI-NEXT: v_readlane_b32 s4, v43, 49 +; SI-NEXT: v_readlane_b32 s4, v42, 63 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s20 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s29 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s35 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s21 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s76 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s95 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s22 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s74 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s88 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s77 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s89 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s75 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s79 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s78 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s63 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s18 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 50 +; SI-NEXT: v_readlane_b32 s4, v43, 1 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 51 +; SI-NEXT: v_readlane_b32 s4, v43, 3 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -156192,14 +156204,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_readlane_b32 s4, v43, 4 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s13 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 5 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s59 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 52 +; SI-NEXT: v_readlane_b32 s4, v43, 6 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -156207,16 
+156221,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s6 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 53 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; SI-NEXT: v_readlane_b32 s4, v43, 54 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s60 +; SI-NEXT: v_readlane_b32 s4, v43, 7 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 55 +; SI-NEXT: v_readlane_b32 s4, v43, 8 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -156224,15 +156237,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s99 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 56 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s57 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: v_readlane_b32 s4, v43, 57 +; SI-NEXT: v_readlane_b32 s4, v43, 9 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4 @@ -156244,35 +156256,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s56 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s47 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s61 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s97 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s62 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s45 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s58 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s64 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s28 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s73 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s65 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s46 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s43 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -156286,7 +156298,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> 
inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s68 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s42 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s66 ; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -156371,52 +156383,56 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB89_4: +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: s_mov_b32 s7, s6 +; SI-NEXT: v_readlane_b32 s92, v44, 38 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: v_readlane_b32 s91, v44, 35 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr8 ; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_readlane_b32 s90, v44, 31 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_readlane_b32 s37, v44, 37 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; kill: killed $sgpr8 -; SI-NEXT: v_readlane_b32 s92, v44, 24 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: v_readlane_b32 s91, v44, 20 -; SI-NEXT: s_mov_b32 s90, s88 -; SI-NEXT: v_readlane_b32 s36, v44, 23 -; SI-NEXT: v_readlane_b32 s35, v44, 19 -; SI-NEXT: v_readlane_b32 s62, v44, 22 -; SI-NEXT: v_readlane_b32 s38, v44, 18 -; SI-NEXT: s_mov_b32 s34, s46 -; SI-NEXT: s_mov_b32 s93, s21 -; SI-NEXT: s_mov_b32 s37, s43 -; SI-NEXT: s_mov_b32 s39, s75 -; SI-NEXT: v_readlane_b32 s72, v44, 10 -; SI-NEXT: s_mov_b32 s50, s63 -; SI-NEXT: s_mov_b32 s51, s59 -; SI-NEXT: s_mov_b32 s48, s56 -; SI-NEXT: v_readlane_b32 s30, v44, 21 -; SI-NEXT: s_mov_b32 s49, s61 -; SI-NEXT: s_mov_b32 s52, s79 -; SI-NEXT: v_readlane_b32 s98, v44, 6 -; SI-NEXT: s_mov_b32 s55, s45 -; SI-NEXT: v_readlane_b32 s43, v44, 17 -; SI-NEXT: s_mov_b32 s60, s40 -; SI-NEXT: v_readlane_b32 s41, v44, 14 -; SI-NEXT: s_mov_b32 s53, s42 -; SI-NEXT: s_mov_b32 s54, s13 -; SI-NEXT: v_readlane_b32 s14, v44, 13 -; SI-NEXT: v_readlane_b32 s44, v44, 5 -; SI-NEXT: v_readlane_b32 s9, v44, 11 -; SI-NEXT: v_readlane_b32 s81, v44, 12 -; SI-NEXT: v_readlane_b32 s82, v44, 9 -; SI-NEXT: v_readlane_b32 s10, v44, 16 -; SI-NEXT: v_readlane_b32 s12, v44, 4 -; SI-NEXT: v_readlane_b32 s96, v44, 7 -; SI-NEXT: v_readlane_b32 s83, v44, 8 -; SI-NEXT: v_readlane_b32 s71, v44, 15 +; SI-NEXT: ; kill: killed $sgpr6 +; SI-NEXT: v_readlane_b32 s36, v44, 34 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: v_readlane_b32 s30, v44, 36 +; SI-NEXT: v_readlane_b32 s53, v44, 33 +; SI-NEXT: v_readlane_b32 s34, v44, 27 +; SI-NEXT: v_readlane_b32 s93, v44, 19 +; SI-NEXT: v_readlane_b32 s38, v44, 26 +; SI-NEXT: v_readlane_b32 s75, v44, 18 +; SI-NEXT: v_readlane_b32 s72, v44, 11 +; SI-NEXT: v_readlane_b32 s63, v44, 23 +; SI-NEXT: v_readlane_b32 s59, v44, 24 +; SI-NEXT: v_readlane_b32 s48, v44, 25 +; SI-NEXT: s_mov_b32 s39, s43 +; SI-NEXT: v_readlane_b32 s52, v44, 22 +; SI-NEXT: v_readlane_b32 s49, v44, 32 +; SI-NEXT: v_readlane_b32 s98, v44, 7 +; SI-NEXT: v_readlane_b32 s54, v44, 30 +; SI-NEXT: v_readlane_b32 s55, v44, 20 +; SI-NEXT: v_readlane_b32 s51, v44, 21 +; SI-NEXT: v_readlane_b32 s41, v44, 15 +; SI-NEXT: v_readlane_b32 s50, v44, 28 +; SI-NEXT: v_readlane_b32 s13, v44, 29 +; SI-NEXT: v_readlane_b32 s14, v44, 14 +; SI-NEXT: v_readlane_b32 s44, v44, 6 +; SI-NEXT: v_readlane_b32 s9, v44, 
12 +; SI-NEXT: v_readlane_b32 s81, v44, 13 +; SI-NEXT: v_readlane_b32 s82, v44, 10 +; SI-NEXT: v_readlane_b32 s10, v44, 17 +; SI-NEXT: v_readlane_b32 s7, v44, 4 +; SI-NEXT: v_readlane_b32 s12, v44, 5 +; SI-NEXT: v_readlane_b32 s96, v44, 8 +; SI-NEXT: v_readlane_b32 s83, v44, 9 +; SI-NEXT: v_readlane_b32 s71, v44, 16 ; SI-NEXT: ; kill: killed $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; kill: killed $sgpr8 @@ -156425,50 +156441,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr20 -; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr29 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr22 -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $sgpr25 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr99 -; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; kill: killed $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; kill: killed $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr97 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr61 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr87 ; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr68 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr70 @@ -182467,12 +182481,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane -; SI-NEXT: s_mov_b32 s10, s16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_writelane_b32 v61, s29, 0 -; SI-NEXT: v_writelane_b32 v61, s28, 1 -; SI-NEXT: 
v_writelane_b32 v61, s27, 2 -; SI-NEXT: s_mov_b32 s61, s21 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v63, s30, 0 ; SI-NEXT: v_writelane_b32 v63, s31, 1 ; SI-NEXT: v_writelane_b32 v63, s34, 2 @@ -182506,59 +182515,58 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_writelane_b32 v63, s86, 30 ; SI-NEXT: v_writelane_b32 v63, s87, 31 ; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: s_mov_b32 s10, s16 ; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: s_mov_b32 s67, s19 -; SI-NEXT: s_mov_b32 s54, s17 -; SI-NEXT: s_mov_b32 s35, s23 -; SI-NEXT: s_mov_b32 s39, s26 -; SI-NEXT: s_mov_b32 s62, s25 +; SI-NEXT: s_mov_b32 s50, s29 ; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_writelane_b32 v63, s99, 35 ; SI-NEXT: v_readfirstlane_b32 s99, v1 -; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: v_readfirstlane_b32 s44, v22 ; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s6, v23 +; SI-NEXT: v_readfirstlane_b32 s73, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v62, s74, 0 -; SI-NEXT: v_readfirstlane_b32 s12, v26 -; SI-NEXT: v_writelane_b32 v62, s6, 1 +; SI-NEXT: v_writelane_b32 v62, s44, 0 +; SI-NEXT: v_readfirstlane_b32 s74, v24 +; SI-NEXT: v_writelane_b32 v62, s73, 1 +; SI-NEXT: s_mov_b32 s60, s20 +; SI-NEXT: v_readfirstlane_b32 s69, v23 +; SI-NEXT: v_writelane_b32 v62, s74, 2 +; SI-NEXT: v_readfirstlane_b32 s11, v26 +; SI-NEXT: v_writelane_b32 v62, s69, 3 ; SI-NEXT: v_readfirstlane_b32 s14, v25 -; SI-NEXT: v_writelane_b32 v62, s12, 2 +; SI-NEXT: v_writelane_b32 v62, s11, 4 ; SI-NEXT: v_readfirstlane_b32 s46, v28 -; SI-NEXT: v_writelane_b32 v62, s14, 3 +; SI-NEXT: v_writelane_b32 v62, s14, 5 +; SI-NEXT: s_mov_b32 s68, s24 ; SI-NEXT: v_readfirstlane_b32 s56, v27 -; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: v_writelane_b32 v62, s46, 6 +; SI-NEXT: s_mov_b32 s76, s27 ; SI-NEXT: v_readfirstlane_b32 s57, v30 -; SI-NEXT: v_writelane_b32 v62, s56, 5 -; SI-NEXT: v_readfirstlane_b32 s59, v29 -; SI-NEXT: v_writelane_b32 v62, s57, 6 -; SI-NEXT: v_writelane_b32 v62, s59, 7 -; SI-NEXT: s_mov_b32 s60, s20 -; SI-NEXT: s_mov_b32 s63, s24 +; SI-NEXT: v_writelane_b32 v62, s56, 7 +; SI-NEXT: v_writelane_b32 v62, s57, 8 ; SI-NEXT: v_readfirstlane_b32 s95, v3 ; SI-NEXT: v_readfirstlane_b32 s31, v5 ; SI-NEXT: v_readfirstlane_b32 s24, v9 ; SI-NEXT: v_readfirstlane_b32 s38, v12 ; SI-NEXT: v_readfirstlane_b32 s36, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_readfirstlane_b32 s27, v13 ; SI-NEXT: v_readfirstlane_b32 s9, v16 ; SI-NEXT: v_readfirstlane_b32 s79, v15 ; SI-NEXT: v_readfirstlane_b32 s13, v18 -; SI-NEXT: v_readfirstlane_b32 s15, v17 +; SI-NEXT: v_readfirstlane_b32 s40, v17 ; SI-NEXT: v_readfirstlane_b32 s42, v20 ; SI-NEXT: v_readfirstlane_b32 s43, v19 -; SI-NEXT: v_readfirstlane_b32 s44, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 -; SI-NEXT: v_writelane_b32 v61, s4, 3 -; SI-NEXT: v_readfirstlane_b32 s45, v21 +; SI-NEXT: v_readfirstlane_b32 s89, v29 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: v_readfirstlane_b32 s27, v13 ; SI-NEXT: v_readfirstlane_b32 s98, v10 ; SI-NEXT: v_readfirstlane_b32 s90, v8 ; SI-NEXT: v_readfirstlane_b32 s88, v7 ; SI-NEXT: v_readfirstlane_b32 s91, v6 -; SI-NEXT: v_readfirstlane_b32 s93, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:328 +; SI-NEXT: v_writelane_b32 v61, s4, 0 ; SI-NEXT: v_readfirstlane_b32 s55, v2 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill @@ -182577,389 +182585,374 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 -; SI-NEXT: v_writelane_b32 v61, s4, 4 +; SI-NEXT: v_writelane_b32 v61, s4, 1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:320 -; SI-NEXT: v_writelane_b32 v61, s4, 5 +; SI-NEXT: v_writelane_b32 v61, s4, 2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: v_writelane_b32 v61, s4, 6 +; SI-NEXT: v_writelane_b32 v61, s4, 3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 -; SI-NEXT: v_writelane_b32 v61, s4, 7 +; SI-NEXT: v_writelane_b32 v61, s4, 4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:308 -; SI-NEXT: v_writelane_b32 v61, s4, 8 +; SI-NEXT: v_writelane_b32 v61, s4, 5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 -; SI-NEXT: v_writelane_b32 v61, s4, 9 +; SI-NEXT: v_writelane_b32 v61, s4, 6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 -; SI-NEXT: v_writelane_b32 v61, s4, 10 +; SI-NEXT: v_writelane_b32 v61, s4, 7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 -; SI-NEXT: v_writelane_b32 v61, s4, 11 +; SI-NEXT: v_writelane_b32 v61, s4, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:292 -; SI-NEXT: v_writelane_b32 v61, s4, 12 +; SI-NEXT: v_writelane_b32 v61, s4, 9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:288 -; SI-NEXT: v_writelane_b32 v61, s4, 13 +; SI-NEXT: v_writelane_b32 v61, s4, 10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 -; SI-NEXT: v_writelane_b32 v61, s4, 14 +; SI-NEXT: v_writelane_b32 v61, s4, 11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 -; SI-NEXT: v_writelane_b32 v61, s4, 15 +; SI-NEXT: v_writelane_b32 v61, s4, 12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s67, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: v_writelane_b32 v61, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s54, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:272 -; SI-NEXT: v_writelane_b32 v61, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s65, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], 
s32 offset:268 -; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s70, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 -; SI-NEXT: v_writelane_b32 v61, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s71, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 -; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s49, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256 -; SI-NEXT: v_writelane_b32 v61, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s83, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 -; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s80, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:248 -; SI-NEXT: v_writelane_b32 v61, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s82, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:244 -; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s84, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 -; SI-NEXT: v_writelane_b32 v61, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s87, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:236 -; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s86, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 -; SI-NEXT: v_writelane_b32 v61, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s51, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:228 -; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s96, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 -; SI-NEXT: v_writelane_b32 v61, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v61, s4, 30 +; SI-NEXT: v_writelane_b32 v61, s4, 13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s94, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:216 -; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 -; SI-NEXT: v_writelane_b32 v61, s4, 32 +; SI-NEXT: v_writelane_b32 v61, s4, 14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s16, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v61, s4, 33 +; SI-NEXT: v_writelane_b32 v61, s4, 15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: v_readfirstlane_b32 s15, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 -; SI-NEXT: v_writelane_b32 v61, s4, 34 +; SI-NEXT: v_writelane_b32 v61, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s73, v31 +; SI-NEXT: v_readfirstlane_b32 s45, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:192 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:188 -; SI-NEXT: v_writelane_b32 v61, s4, 35 +; SI-NEXT: v_writelane_b32 v61, s4, 17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s72, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 +; SI-NEXT: v_writelane_b32 v61, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s40, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 +; SI-NEXT: v_writelane_b32 v61, s4, 19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: v_readfirstlane_b32 s85, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s85, v31 +; SI-NEXT: v_readfirstlane_b32 s81, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:172 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s81, v31 +; SI-NEXT: v_readfirstlane_b32 s97, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s97, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:164 +; SI-NEXT: v_writelane_b32 v61, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s7, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: v_writelane_b32 v61, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s11, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 +; SI-NEXT: v_writelane_b32 v61, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s41, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s47, v31 +; SI-NEXT: v_readfirstlane_b32 s12, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_readfirstlane_b32 s58, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s76, v31 +; SI-NEXT: v_readfirstlane_b32 s47, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s29, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 +; SI-NEXT: v_writelane_b32 v61, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_readfirstlane_b32 s59, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: v_writelane_b32 v61, s4, 36 -; SI-NEXT: v_writelane_b32 v61, s54, 37 -; SI-NEXT: v_writelane_b32 v61, s10, 38 -; SI-NEXT: v_writelane_b32 v61, s67, 39 -; SI-NEXT: v_writelane_b32 v61, s18, 40 -; SI-NEXT: v_writelane_b32 v61, s61, 41 -; SI-NEXT: v_writelane_b32 v61, s60, 42 -; SI-NEXT: v_writelane_b32 v61, s35, 43 -; SI-NEXT: v_writelane_b32 v61, s22, 44 -; 
SI-NEXT: v_writelane_b32 v61, s62, 45 -; SI-NEXT: v_writelane_b32 v61, s63, 46 -; SI-NEXT: v_writelane_b32 v61, s39, 47 -; SI-NEXT: v_writelane_b32 v61, s99, 48 -; SI-NEXT: v_writelane_b32 v61, s95, 49 -; SI-NEXT: v_writelane_b32 v61, s31, 50 -; SI-NEXT: v_writelane_b32 v61, s24, 51 -; SI-NEXT: v_writelane_b32 v61, s38, 52 -; SI-NEXT: v_writelane_b32 v61, s36, 53 -; SI-NEXT: v_writelane_b32 v61, s8, 54 -; SI-NEXT: v_writelane_b32 v61, s27, 55 -; SI-NEXT: v_writelane_b32 v61, s9, 56 -; SI-NEXT: v_writelane_b32 v61, s79, 57 -; SI-NEXT: v_writelane_b32 v61, s13, 58 -; SI-NEXT: v_writelane_b32 v61, s15, 59 -; SI-NEXT: v_writelane_b32 v61, s42, 60 -; SI-NEXT: v_writelane_b32 v61, s43, 61 -; SI-NEXT: v_writelane_b32 v61, s44, 62 -; SI-NEXT: v_writelane_b32 v61, s45, 63 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s37, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; SI-NEXT: v_writelane_b32 v61, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s50, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 +; SI-NEXT: v_writelane_b32 v61, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s48, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 +; SI-NEXT: v_writelane_b32 v61, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s19, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; SI-NEXT: v_writelane_b32 v61, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s64, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; SI-NEXT: v_writelane_b32 v61, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s17, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; SI-NEXT: v_writelane_b32 v61, s4, 29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s65, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; SI-NEXT: v_writelane_b32 v61, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s71, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: v_writelane_b32 v61, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s70, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: v_writelane_b32 v61, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s83, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: v_writelane_b32 v61, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s49, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: v_writelane_b32 v61, s4, 34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s80, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 +; SI-NEXT: v_writelane_b32 v61, s4, 35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s82, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: 
v_writelane_b32 v61, s4, 36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s87, v31 +; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: v_writelane_b32 v61, s4, 37 +; SI-NEXT: v_writelane_b32 v61, s17, 38 +; SI-NEXT: v_writelane_b32 v61, s10, 39 +; SI-NEXT: v_writelane_b32 v61, s19, 40 +; SI-NEXT: v_writelane_b32 v61, s18, 41 +; SI-NEXT: v_writelane_b32 v61, s21, 42 +; SI-NEXT: v_writelane_b32 v61, s60, 43 +; SI-NEXT: v_writelane_b32 v61, s23, 44 +; SI-NEXT: v_writelane_b32 v61, s22, 45 +; SI-NEXT: v_writelane_b32 v61, s25, 46 +; SI-NEXT: v_writelane_b32 v61, s68, 47 +; SI-NEXT: v_writelane_b32 v61, s76, 48 +; SI-NEXT: v_writelane_b32 v61, s26, 49 +; SI-NEXT: v_writelane_b32 v61, s50, 50 +; SI-NEXT: v_writelane_b32 v61, s99, 51 +; SI-NEXT: v_writelane_b32 v61, s28, 52 +; SI-NEXT: v_writelane_b32 v61, s95, 53 +; SI-NEXT: v_writelane_b32 v61, s31, 54 +; SI-NEXT: v_writelane_b32 v61, s24, 55 +; SI-NEXT: v_writelane_b32 v61, s38, 56 +; SI-NEXT: v_writelane_b32 v61, s36, 57 +; SI-NEXT: v_writelane_b32 v61, s9, 58 +; SI-NEXT: v_writelane_b32 v61, s79, 59 +; SI-NEXT: v_writelane_b32 v61, s13, 60 +; SI-NEXT: v_writelane_b32 v61, s40, 61 +; SI-NEXT: v_writelane_b32 v61, s42, 62 +; SI-NEXT: v_writelane_b32 v61, s43, 63 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s84, v31 +; SI-NEXT: v_readfirstlane_b32 s78, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s51, v31 +; SI-NEXT: v_readfirstlane_b32 s29, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s86, v31 +; SI-NEXT: v_readfirstlane_b32 s75, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s94, v31 +; SI-NEXT: v_readfirstlane_b32 s77, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s96, v31 +; SI-NEXT: v_readfirstlane_b32 s30, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s68, v31 +; SI-NEXT: v_readfirstlane_b32 s92, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s34, v31 +; SI-NEXT: v_readfirstlane_b32 s35, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: v_readfirstlane_b32 s39, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s66, v31 +; SI-NEXT: v_readfirstlane_b32 s64, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s78, v31 +; SI-NEXT: v_readfirstlane_b32 s48, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: v_readfirstlane_b32 s52, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s69, v31 +; SI-NEXT: v_readfirstlane_b32 s37, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s30, v31 +; SI-NEXT: v_readfirstlane_b32 s63, v31 ; SI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s52, v31 +; SI-NEXT: v_readfirstlane_b32 s34, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s75, v31 +; SI-NEXT: v_readfirstlane_b32 s62, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s23, v31 +; SI-NEXT: v_readfirstlane_b32 s7, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s72, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s26, v31 +; SI-NEXT: v_readfirstlane_b32 s66, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: v_readfirstlane_b32 s93, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v62, s25, 8 -; SI-NEXT: v_writelane_b32 v62, s28, 9 +; SI-NEXT: v_writelane_b32 v62, s93, 9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s92, v31 -; SI-NEXT: v_writelane_b32 v62, s92, 10 -; SI-NEXT: v_writelane_b32 v62, s75, 11 -; SI-NEXT: v_writelane_b32 v62, s26, 12 -; SI-NEXT: v_writelane_b32 v62, s30, 13 -; SI-NEXT: v_writelane_b32 v62, s23, 14 -; SI-NEXT: v_writelane_b32 v62, s52, 15 -; SI-NEXT: v_writelane_b32 v62, s64, 16 -; SI-NEXT: v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s65, 18 -; SI-NEXT: v_writelane_b32 v62, s70, 19 -; SI-NEXT: v_writelane_b32 v62, s71, 20 -; SI-NEXT: v_writelane_b32 v62, s49, 21 -; SI-NEXT: v_writelane_b32 v62, s83, 22 -; SI-NEXT: v_writelane_b32 v62, s80, 23 -; SI-NEXT: v_writelane_b32 v62, s82, 24 -; SI-NEXT: v_writelane_b32 v62, s84, 25 -; SI-NEXT: v_writelane_b32 v62, s87, 26 -; SI-NEXT: v_writelane_b32 v62, s86, 27 -; SI-NEXT: v_writelane_b32 v62, s51, 28 -; SI-NEXT: v_writelane_b32 v62, s96, 29 -; SI-NEXT: v_writelane_b32 v62, s34, 30 -; SI-NEXT: v_writelane_b32 v62, s94, 31 -; SI-NEXT: v_writelane_b32 v62, s53, 32 -; SI-NEXT: v_writelane_b32 v62, s66, 33 -; SI-NEXT: v_writelane_b32 v62, s68, 34 -; SI-NEXT: v_writelane_b32 v62, s69, 35 -; SI-NEXT: v_writelane_b32 v62, s77, 36 -; SI-NEXT: v_writelane_b32 v62, s78, 37 -; SI-NEXT: s_cbranch_scc0 .LBB93_4 +; SI-NEXT: v_readfirstlane_b32 s53, v31 +; SI-NEXT: v_writelane_b32 v62, s53, 10 +; SI-NEXT: v_writelane_b32 v62, s66, 11 +; SI-NEXT: v_writelane_b32 v62, s7, 12 +; SI-NEXT: v_writelane_b32 v62, s78, 13 +; SI-NEXT: v_writelane_b32 v62, s77, 14 +; SI-NEXT: v_writelane_b32 v62, s92, 15 +; SI-NEXT: v_writelane_b32 v62, s75, 16 +; SI-NEXT: v_writelane_b32 v62, s37, 17 +; SI-NEXT: v_writelane_b32 v62, s39, 18 +; SI-NEXT: v_writelane_b32 v62, s30, 19 +; SI-NEXT: v_writelane_b32 v62, s48, 20 +; SI-NEXT: v_writelane_b32 v62, s35, 21 +; SI-NEXT: v_writelane_b32 v62, s52, 22 +; SI-NEXT: v_writelane_b32 v62, s64, 23 +; SI-NEXT: s_cbranch_scc0 .LBB93_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 ; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s67, 8 +; SI-NEXT: s_lshl_b32 s5, s19, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: 
s_lshl_b32 s5, s21, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: s_lshl_b32 s5, s35, 8 +; SI-NEXT: s_lshl_b32 s5, s23, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: s_and_b32 s4, s63, 0xff -; SI-NEXT: s_lshl_b32 s5, s62, 8 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s5, v61, 2 -; SI-NEXT: s_and_b32 s4, s39, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: s_and_b32 s4, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s76, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s50, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 1 -; SI-NEXT: v_readlane_b32 s5, v61, 0 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: s_and_b32 s4, s99, 0xff ; SI-NEXT: s_lshl_b32 s5, s55, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s4 ; SI-NEXT: s_and_b32 s4, s95, 0xff -; SI-NEXT: s_lshl_b32 s5, s93, 8 +; SI-NEXT: s_lshl_b32 s5, s6, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s4 ; SI-NEXT: s_and_b32 s4, s31, 0xff @@ -182986,7 +182979,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s9, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: s_and_b32 s4, s40, 0xff ; SI-NEXT: s_lshl_b32 s5, s13, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 @@ -182994,231 +182987,230 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xff +; SI-NEXT: s_and_b32 s4, s73, 0xff ; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_and_b32 s4, s69, 0xff ; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 ; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 ; SI-NEXT: s_and_b32 s4, s56, 0xff ; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_and_b32 s4, s89, 0xff ; SI-NEXT: s_lshl_b32 s5, s57, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s92, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_lshl_b32 s5, s93, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: 
v_cvt_f32_f16_e32 v25, s4 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_and_b32 s4, s7, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s4 -; SI-NEXT: s_and_b32 s4, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s30, 8 +; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_lshl_b32 s5, s63, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s53, 8 +; SI-NEXT: s_and_b32 s4, s37, 0xff +; SI-NEXT: s_lshl_b32 s5, s52, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s4 -; SI-NEXT: s_and_b32 s4, s78, 0xff -; SI-NEXT: s_lshl_b32 s5, s66, 8 +; SI-NEXT: s_and_b32 s4, s48, 0xff +; SI-NEXT: s_lshl_b32 s5, s64, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s4 -; SI-NEXT: s_and_b32 s4, s77, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_and_b32 s4, s39, 0xff +; SI-NEXT: s_lshl_b32 s5, s35, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_and_b32 s4, s92, 0xff +; SI-NEXT: s_lshl_b32 s5, s30, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v31, s4 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s86, 8 +; SI-NEXT: s_and_b32 s4, s77, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s4 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s79, v61, 37 +; SI-NEXT: v_readlane_b32 s57, v61, 36 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: s_and_b32 s4, s87, 0xff -; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_and_b32 s4, s79, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s46, v61, 35 +; SI-NEXT: v_readlane_b32 s56, v61, 34 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s4 -; SI-NEXT: s_and_b32 s4, s80, 0xff -; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s43, v61, 33 +; SI-NEXT: v_readlane_b32 s44, v61, 32 ; SI-NEXT: v_cvt_f32_f16_e32 v35, s4 -; SI-NEXT: s_and_b32 s4, s83, 0xff -; SI-NEXT: s_lshl_b32 s5, s70, 8 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s40, v61, 31 +; SI-NEXT: v_readlane_b32 s42, v61, 30 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s22, v61, 29 +; SI-NEXT: v_readlane_b32 s26, v61, 28 ; SI-NEXT: v_cvt_f32_f16_e32 v37, s4 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s64, 8 +; SI-NEXT: s_and_b32 s4, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s26, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s28, v61, 27 +; SI-NEXT: v_readlane_b32 s18, v61, 26 ; SI-NEXT: v_cvt_f32_f16_e32 v36, s4 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s18, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s14, v61, 25 +; SI-NEXT: v_readlane_b32 s13, v61, 24 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s4 -; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: 
s_lshl_b32 s5, s37, 8 +; SI-NEXT: s_and_b32 s4, s14, 0xff +; SI-NEXT: s_lshl_b32 s5, s13, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s8, v61, 36 +; SI-NEXT: v_readlane_b32 s11, v61, 23 ; SI-NEXT: v_cvt_f32_f16_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s11, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s76, 0xff +; SI-NEXT: s_and_b32 s4, s47, 0xff ; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_and_b32 s4, s12, 0xff ; SI-NEXT: s_lshl_b32 s5, s41, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s25, v61, 22 +; SI-NEXT: v_readlane_b32 s10, v61, 21 ; SI-NEXT: v_cvt_f32_f16_e32 v51, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 +; SI-NEXT: s_and_b32 s4, s25, 0xff +; SI-NEXT: s_lshl_b32 s5, s10, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s21, v61, 20 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 -; SI-NEXT: s_and_b32 s4, s97, 0xff -; SI-NEXT: s_lshl_b32 s5, s81, 8 +; SI-NEXT: s_and_b32 s4, s21, 0xff +; SI-NEXT: s_lshl_b32 s5, s97, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s4 -; SI-NEXT: s_and_b32 s4, s85, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s81, 0xff +; SI-NEXT: s_lshl_b32 s5, s85, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_readlane_b32 s9, v61, 19 +; SI-NEXT: v_readlane_b32 s7, v61, 18 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s69, v61, 35 +; SI-NEXT: v_readlane_b32 s68, v61, 17 ; SI-NEXT: v_cvt_f32_f16_e32 v55, s4 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s73, 8 +; SI-NEXT: s_and_b32 s4, s68, 0xff +; SI-NEXT: s_lshl_b32 s5, s45, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s68, v61, 34 +; SI-NEXT: v_readlane_b32 s66, v61, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s4 -; SI-NEXT: s_and_b32 s4, s68, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 +; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s66, v61, 33 +; SI-NEXT: v_readlane_b32 s53, v61, 15 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s4 -; SI-NEXT: s_and_b32 s4, s66, 0xff +; SI-NEXT: s_and_b32 s4, s53, 0xff ; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_mov_b32 s61, s34 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s94, v61, 31 +; SI-NEXT: v_readlane_b32 s34, v61, 14 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 -; SI-NEXT: s_and_b32 s4, s53, 0xff +; SI-NEXT: s_and_b32 s4, s34, 0xff ; SI-NEXT: s_lshl_b32 s5, s94, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s96, v61, 29 +; SI-NEXT: v_readlane_b32 s93, v61, 13 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 -; SI-NEXT: s_and_b32 s4, s34, 0xff +; SI-NEXT: s_and_b32 s4, s93, 0xff ; SI-NEXT: s_lshl_b32 s5, s96, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: v_readlane_b32 s86, v61, 27 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 ; SI-NEXT: s_and_b32 s4, s51, 0xff ; SI-NEXT: s_lshl_b32 s5, s86, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s84, v61, 25 ; SI-NEXT: v_cvt_f32_f16_e32 
v45, s4 ; SI-NEXT: s_and_b32 s4, s87, 0xff ; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s80, v61, 23 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 ; SI-NEXT: s_and_b32 s4, s82, 0xff ; SI-NEXT: s_lshl_b32 s5, s80, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s49, v61, 21 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 ; SI-NEXT: s_and_b32 s4, s83, 0xff ; SI-NEXT: s_lshl_b32 s5, s49, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s70, v61, 19 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s4 ; SI-NEXT: s_and_b32 s4, s71, 0xff ; SI-NEXT: s_lshl_b32 s5, s70, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s54, v61, 17 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 ; SI-NEXT: s_and_b32 s4, s65, 0xff ; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: s_mov_b32 s19, s50 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s50, v61, 15 +; SI-NEXT: v_readlane_b32 s50, v61, 12 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: s_and_b32 s4, s67, 0xff ; SI-NEXT: s_lshl_b32 s5, s50, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s52, v61, 13 +; SI-NEXT: v_readlane_b32 s64, v61, 11 +; SI-NEXT: v_readlane_b32 s52, v61, 10 ; SI-NEXT: v_cvt_f32_f16_e32 v59, s4 ; SI-NEXT: s_and_b32 s4, s64, 0xff ; SI-NEXT: s_lshl_b32 s5, s52, 8 -; SI-NEXT: s_mov_b32 s23, s48 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s48, v61, 11 +; SI-NEXT: v_readlane_b32 s35, v61, 9 +; SI-NEXT: v_readlane_b32 s48, v61, 8 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 ; SI-NEXT: s_and_b32 s4, s35, 0xff ; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s39, v61, 9 +; SI-NEXT: v_readlane_b32 s30, v61, 7 +; SI-NEXT: v_readlane_b32 s39, v61, 6 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 ; SI-NEXT: s_and_b32 s4, s30, 0xff ; SI-NEXT: s_lshl_b32 s5, s39, 8 -; SI-NEXT: s_mov_b32 s26, s37 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s75, v61, 7 +; SI-NEXT: v_readlane_b32 s37, v61, 5 +; SI-NEXT: v_readlane_b32 s75, v61, 4 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 ; SI-NEXT: s_and_b32 s4, s37, 0xff ; SI-NEXT: s_lshl_b32 s5, s75, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s77, v61, 5 +; SI-NEXT: v_readlane_b32 s92, v61, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s92, 0xff ; SI-NEXT: s_lshl_b32 s5, s77, 8 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s76, v61, 3 +; SI-NEXT: v_readlane_b32 s78, v61, 1 +; SI-NEXT: v_readlane_b32 s76, v61, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_and_b32 s4, s78, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 @@ -183226,24 +183218,151 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b32 s99, s55 ; SI-NEXT: s_mov_b32 s20, s88 ; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s59, s58 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 -; 
SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_mov_b32 s55, s93 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s55, s6 ; SI-NEXT: s_mov_b32 s95, s91 ; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: s_cbranch_execnz .LBB93_3 -; SI-NEXT: .LBB93_2: ; %cmp.true +; SI-NEXT: s_mov_b32 s36, s8 +; SI-NEXT: s_mov_b32 s38, s27 +; SI-NEXT: s_mov_b32 s6, s7 +; SI-NEXT: s_mov_b32 s8, s9 +; SI-NEXT: s_mov_b32 s7, s10 +; SI-NEXT: s_mov_b32 s98, s89 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s17, s72 +; SI-NEXT: s_mov_b32 s19, s62 +; SI-NEXT: s_mov_b32 s11, s13 +; SI-NEXT: s_mov_b32 s23, s63 +; SI-NEXT: s_mov_b32 s13, s18 +; SI-NEXT: s_mov_b32 s27, s22 +; SI-NEXT: s_branch .LBB93_3 +; SI-NEXT: .LBB93_2: +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s61, s34 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_mov_b32 s24, s98 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: s_mov_b32 s20, s88 +; SI-NEXT: s_mov_b32 s99, s55 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_readlane_b32 s75, v61, 4 +; SI-NEXT: v_readlane_b32 s76, v61, 0 +; SI-NEXT: v_readlane_b32 s77, v61, 2 +; SI-NEXT: v_readlane_b32 s78, v61, 1 +; SI-NEXT: v_readlane_b32 s92, v61, 3 +; SI-NEXT: v_readlane_b32 s39, v61, 6 +; SI-NEXT: v_readlane_b32 s37, v61, 5 +; SI-NEXT: v_readlane_b32 s30, v61, 7 +; SI-NEXT: v_readlane_b32 s48, v61, 8 +; SI-NEXT: v_readlane_b32 s52, v61, 10 +; SI-NEXT: v_readlane_b32 s35, v61, 9 +; SI-NEXT: v_readlane_b32 s50, v61, 12 +; SI-NEXT: v_readlane_b32 s64, v61, 11 +; SI-NEXT: s_mov_b32 s55, s6 +; SI-NEXT: s_mov_b32 s95, s91 +; SI-NEXT: s_mov_b32 s31, s90 +; SI-NEXT: s_mov_b32 s36, s8 +; SI-NEXT: s_mov_b32 s38, s27 +; SI-NEXT: v_readlane_b32 s6, v61, 18 +; SI-NEXT: v_readlane_b32 s93, v61, 13 +; SI-NEXT: v_readlane_b32 s34, v61, 14 +; SI-NEXT: v_readlane_b32 s53, v61, 15 +; SI-NEXT: v_readlane_b32 s66, v61, 16 +; SI-NEXT: v_readlane_b32 s68, v61, 17 +; SI-NEXT: v_readlane_b32 s8, v61, 19 +; SI-NEXT: v_readlane_b32 s21, v61, 20 +; SI-NEXT: v_readlane_b32 s25, v61, 22 +; SI-NEXT: v_readlane_b32 s7, v61, 21 +; SI-NEXT: s_mov_b32 s98, s89 +; SI-NEXT: v_readlane_b32 s9, v61, 23 +; SI-NEXT: s_mov_b32 s17, s72 +; SI-NEXT: s_mov_b32 s19, s62 +; SI-NEXT: v_readlane_b32 s11, v61, 24 +; SI-NEXT: s_mov_b32 s23, s63 +; SI-NEXT: v_readlane_b32 s13, v61, 26 +; SI-NEXT: v_readlane_b32 s14, v61, 25 +; SI-NEXT: v_readlane_b32 s26, v61, 28 +; SI-NEXT: v_readlane_b32 s27, v61, 29 +; SI-NEXT: v_readlane_b32 s28, v61, 27 +; SI-NEXT: v_readlane_b32 s40, v61, 31 +; SI-NEXT: v_readlane_b32 s42, v61, 30 +; SI-NEXT: v_readlane_b32 s43, v61, 33 +; SI-NEXT: v_readlane_b32 s44, v61, 32 +; SI-NEXT: v_readlane_b32 s46, v61, 35 +; SI-NEXT: v_readlane_b32 s56, v61, 34 +; SI-NEXT: v_readlane_b32 s57, v61, 36 +; SI-NEXT: v_readlane_b32 s79, v61, 37 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr15 
+; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: .LBB93_3: ; %Flow +; SI-NEXT: s_mov_b32 s88, s29 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB93_5 +; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: s_add_i32 s4, s78, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s76, 8 @@ -183261,6 +183380,8 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s60, s39, 8 ; SI-NEXT: s_or_b32 s60, s60, vcc_hi ; SI-NEXT: s_add_i32 vcc_hi, s35, 3 +; SI-NEXT: s_mov_b32 s90, s20 +; SI-NEXT: s_mov_b32 s20, s61 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s61, s48, 8 ; SI-NEXT: s_or_b32 s61, s61, vcc_hi @@ -183293,228 +183414,209 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_lshl_b32 s76, s84, 8 ; SI-NEXT: s_or_b32 s76, s76, vcc_hi ; SI-NEXT: s_add_i32 vcc_hi, s51, 3 -; SI-NEXT: s_add_i32 s93, s53, 3 +; SI-NEXT: s_add_i32 s89, s93, 3 +; SI-NEXT: s_add_i32 s93, s34, 3 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s77, s86, 8 -; SI-NEXT: s_add_i32 s89, s34, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff ; SI-NEXT: s_lshl_b32 s78, s94, 8 -; SI-NEXT: s_add_i32 s34, s66, 3 +; SI-NEXT: s_add_i32 s34, s53, 3 ; SI-NEXT: s_or_b32 s77, s77, vcc_hi ; SI-NEXT: s_and_b32 s89, s89, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s96, 8 ; SI-NEXT: s_or_b32 s22, s78, s93 ; SI-NEXT: s_and_b32 s93, s34, 0xff ; SI-NEXT: s_lshl_b32 s92, s16, 8 -; SI-NEXT: s_add_i32 s53, s68, 3 +; SI-NEXT: s_add_i32 s53, s66, 3 ; SI-NEXT: s_or_b32 s89, vcc_hi, s89 ; SI-NEXT: s_or_b32 s92, s92, s93 ; SI-NEXT: s_and_b32 s93, s53, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s15, 8 -; SI-NEXT: s_add_i32 s66, s69, 3 +; SI-NEXT: s_add_i32 s66, s68, 3 ; SI-NEXT: s_or_b32 s93, vcc_hi, 
s93 ; SI-NEXT: s_and_b32 vcc_hi, s66, 0xff ; SI-NEXT: s_lshl_b32 s34, s45, 8 -; SI-NEXT: s_add_i32 s68, s6, 3 +; SI-NEXT: s_add_i32 s68, s8, 3 ; SI-NEXT: s_or_b32 vcc_hi, s34, vcc_hi ; SI-NEXT: s_and_b32 s34, s68, 0xff -; SI-NEXT: s_lshl_b32 s39, s40, 8 +; SI-NEXT: s_lshl_b32 s39, s6, 8 ; SI-NEXT: s_add_i32 s69, s81, 3 ; SI-NEXT: s_or_b32 s34, s39, s34 ; SI-NEXT: s_and_b32 s39, s69, 0xff -; SI-NEXT: s_lshl_b32 s52, s21, 8 -; SI-NEXT: s_add_i32 s81, s7, 3 +; SI-NEXT: s_lshl_b32 s52, s85, 8 +; SI-NEXT: s_add_i32 s81, s21, 3 ; SI-NEXT: s_or_b32 s39, s52, s39 ; SI-NEXT: s_and_b32 s52, s81, 0xff ; SI-NEXT: s_lshl_b32 s53, s97, 8 -; SI-NEXT: s_add_i32 s85, s12, 3 +; SI-NEXT: s_add_i32 s85, s25, 3 ; SI-NEXT: s_or_b32 s52, s53, s52 ; SI-NEXT: s_and_b32 s53, s85, 0xff -; SI-NEXT: s_lshl_b32 s64, s11, 8 -; SI-NEXT: s_add_i32 s97, s56, 3 +; SI-NEXT: s_lshl_b32 s64, s7, 8 +; SI-NEXT: s_add_i32 s97, s12, 3 ; SI-NEXT: s_or_b32 s53, s64, s53 ; SI-NEXT: s_and_b32 s64, s97, 0xff -; SI-NEXT: s_lshl_b32 s66, s46, 8 -; SI-NEXT: s_add_i32 s21, s29, 3 +; SI-NEXT: s_lshl_b32 s66, s41, 8 +; SI-NEXT: s_add_i32 s21, s47, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 12 +; SI-NEXT: s_mov_b32 s91, s24 ; SI-NEXT: s_or_b32 s64, s66, s64 ; SI-NEXT: s_and_b32 s21, s21, 0xff -; SI-NEXT: s_lshl_b32 s66, s59, 8 -; SI-NEXT: s_add_i32 s25, s8, 3 +; SI-NEXT: s_lshl_b32 s66, s58, 8 +; SI-NEXT: s_add_i32 s25, s59, 3 +; SI-NEXT: s_add_i32 s24, s16, 3 +; SI-NEXT: v_readlane_b32 s16, v62, 11 ; SI-NEXT: s_or_b32 s66, s66, s21 ; SI-NEXT: s_and_b32 s21, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s28, 8 -; SI-NEXT: s_add_i32 s29, s19, 3 +; SI-NEXT: s_lshl_b32 s6, s9, 8 +; SI-NEXT: s_add_i32 s29, s14, 3 +; SI-NEXT: s_add_i32 s7, s27, 3 +; SI-NEXT: s_add_i32 s27, s20, 3 +; SI-NEXT: s_add_i32 s20, s16, 3 ; SI-NEXT: s_or_b32 s67, s6, s21 ; SI-NEXT: s_and_b32 s6, s29, 0xff -; SI-NEXT: s_lshl_b32 s18, s26, 8 -; SI-NEXT: s_add_i32 s28, s17, 3 -; SI-NEXT: s_or_b32 s68, s18, s6 -; SI-NEXT: s_and_b32 s6, s28, 0xff -; SI-NEXT: s_lshl_b32 s18, s23, 8 -; SI-NEXT: s_or_b32 s69, s18, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 17 -; SI-NEXT: s_add_i32 s7, s6, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 15 -; SI-NEXT: s_and_b32 s6, s7, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 16 -; SI-NEXT: s_add_i32 s27, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 13 -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_lshl_b32 s23, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 14 -; SI-NEXT: s_mov_b32 s91, s24 -; SI-NEXT: s_or_b32 s70, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 20 -; SI-NEXT: s_add_i32 s24, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 11 -; SI-NEXT: s_add_i32 s11, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 18 -; SI-NEXT: s_lshl_b32 s19, s16, 8 -; SI-NEXT: v_readlane_b32 s16, v62, 12 -; SI-NEXT: s_mov_b32 s90, s20 -; SI-NEXT: s_and_b32 s6, s11, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 -; SI-NEXT: s_add_i32 s20, s16, 3 -; SI-NEXT: v_readlane_b32 s16, v62, 9 -; SI-NEXT: s_or_b32 s71, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 22 +; SI-NEXT: s_lshl_b32 s18, s11, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s20, s20, 0xff -; SI-NEXT: s_lshl_b32 s17, s16, 8 +; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: v_readlane_b32 s16, v62, 10 -; SI-NEXT: s_add_i32 s12, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: s_or_b32 s68, s18, s6 +; SI-NEXT: s_and_b32 s6, s28, 0xff +; SI-NEXT: s_lshl_b32 s18, s13, 8 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_lshl_b32 s19, s19, 8 ; SI-NEXT: s_or_b32 s17, s17, s20 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s20, v62, 8 
-; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: v_readlane_b32 s20, v62, 9 +; SI-NEXT: s_or_b32 s69, s18, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s26, 8 +; SI-NEXT: s_add_i32 s11, s40, 3 +; SI-NEXT: s_or_b32 s19, s19, s24 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s20, s20, 8 -; SI-NEXT: s_or_b32 s81, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 23 -; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_add_i32 s98, s98, 3 +; SI-NEXT: v_readlane_b32 s24, v62, 8 +; SI-NEXT: s_or_b32 s70, s7, s6 +; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_lshl_b32 s7, s42, 8 +; SI-NEXT: s_add_i32 s12, s43, 3 ; SI-NEXT: s_or_b32 s16, s20, s16 -; SI-NEXT: v_readlane_b32 s20, v62, 7 -; SI-NEXT: s_add_i32 s14, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 21 -; SI-NEXT: s_or_b32 s19, s19, s24 -; SI-NEXT: s_add_i32 s98, s20, 3 -; SI-NEXT: v_readlane_b32 s24, v62, 6 -; SI-NEXT: s_and_b32 s6, s14, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_and_b32 s20, s98, 0xff ; SI-NEXT: s_lshl_b32 s24, s24, 8 -; SI-NEXT: s_or_b32 s83, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 26 +; SI-NEXT: s_or_b32 s71, s7, s6 +; SI-NEXT: s_and_b32 s6, s12, 0xff +; SI-NEXT: s_lshl_b32 s7, s44, 8 +; SI-NEXT: s_add_i32 s14, s46, 3 ; SI-NEXT: s_and_b32 s27, s27, 0xff +; SI-NEXT: s_lshl_b32 s23, s23, 8 ; SI-NEXT: s_or_b32 s20, s24, s20 -; SI-NEXT: v_readlane_b32 s24, v62, 5 -; SI-NEXT: s_add_i32 s41, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 24 +; SI-NEXT: v_readlane_b32 s24, v62, 7 +; SI-NEXT: s_or_b32 s81, s7, s6 +; SI-NEXT: s_and_b32 s6, s14, 0xff +; SI-NEXT: s_lshl_b32 s7, s56, 8 +; SI-NEXT: s_add_i32 s41, s79, 3 ; SI-NEXT: s_or_b32 s23, s23, s27 ; SI-NEXT: s_add_i32 s86, s24, 3 -; SI-NEXT: v_readlane_b32 s27, v62, 4 +; SI-NEXT: v_readlane_b32 s27, v62, 6 +; SI-NEXT: s_or_b32 s83, s7, s6 ; SI-NEXT: s_and_b32 s6, s41, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_lshl_b32 s7, s57, 8 ; SI-NEXT: s_and_b32 s24, s86, 0xff ; SI-NEXT: s_lshl_b32 s27, s27, 8 ; SI-NEXT: s_or_b32 s85, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 28 +; SI-NEXT: s_add_i32 s46, s88, 3 +; SI-NEXT: v_readlane_b32 s7, v62, 13 ; SI-NEXT: s_or_b32 s24, s27, s24 -; SI-NEXT: v_readlane_b32 s27, v62, 3 -; SI-NEXT: s_add_i32 s46, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 25 -; SI-NEXT: s_add_i32 s12, s73, 0x300 -; SI-NEXT: s_add_i32 s82, s27, 3 -; SI-NEXT: v_readlane_b32 s73, v62, 2 +; SI-NEXT: v_readlane_b32 s27, v62, 5 ; SI-NEXT: s_and_b32 s6, s46, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s12, s73, 0x300 +; SI-NEXT: s_add_i32 s82, s27, 3 +; SI-NEXT: v_readlane_b32 s73, v62, 4 +; SI-NEXT: s_or_b32 s96, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 14 ; SI-NEXT: s_and_b32 s27, s82, 0xff ; SI-NEXT: s_lshl_b32 s73, s73, 8 -; SI-NEXT: s_or_b32 s96, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 31 -; SI-NEXT: s_or_b32 s27, s73, s27 -; SI-NEXT: v_readlane_b32 s73, v62, 1 ; SI-NEXT: s_add_i32 s47, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 27 -; SI-NEXT: s_add_i32 s13, s74, 0x300 -; SI-NEXT: s_add_i32 s65, s73, 3 -; SI-NEXT: v_readlane_b32 s74, v62, 0 +; SI-NEXT: v_readlane_b32 s7, v62, 16 +; SI-NEXT: s_or_b32 s27, s73, s27 +; SI-NEXT: v_readlane_b32 s73, v62, 3 ; SI-NEXT: s_and_b32 s6, s47, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s13, s74, 0x300 +; SI-NEXT: s_add_i32 s65, s73, 3 +; SI-NEXT: v_readlane_b32 s74, v62, 2 +; SI-NEXT: s_or_b32 s97, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 15 ; SI-NEXT: s_and_b32 s73, s65, 0xff ; SI-NEXT: s_lshl_b32 s74, s74, 8 -; SI-NEXT: s_or_b32 
s97, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 34 -; SI-NEXT: s_or_b32 s73, s74, s73 -; SI-NEXT: v_readlane_b32 s74, v61, 63 ; SI-NEXT: s_add_i32 s56, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 29 -; SI-NEXT: s_add_i32 s14, s75, 0x300 -; SI-NEXT: s_add_i32 s54, s74, 3 -; SI-NEXT: v_readlane_b32 s75, v61, 62 +; SI-NEXT: v_readlane_b32 s7, v62, 19 +; SI-NEXT: s_or_b32 s73, s74, s73 +; SI-NEXT: v_readlane_b32 s74, v62, 1 ; SI-NEXT: s_and_b32 s6, s56, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s14, s75, 0x300 +; SI-NEXT: s_add_i32 s54, s74, 3 +; SI-NEXT: v_readlane_b32 s75, v62, 0 +; SI-NEXT: s_or_b32 s63, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 18 ; SI-NEXT: s_and_b32 s74, s54, 0xff ; SI-NEXT: s_lshl_b32 s75, s75, 8 -; SI-NEXT: s_or_b32 s63, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 36 -; SI-NEXT: s_or_b32 s74, s75, s74 -; SI-NEXT: v_readlane_b32 s75, v61, 61 ; SI-NEXT: s_add_i32 s58, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 30 -; SI-NEXT: s_add_i32 s15, s76, 0x300 -; SI-NEXT: s_add_i32 s50, s75, 3 -; SI-NEXT: v_readlane_b32 s76, v61, 60 +; SI-NEXT: v_readlane_b32 s7, v62, 21 +; SI-NEXT: s_or_b32 s74, s75, s74 +; SI-NEXT: v_readlane_b32 s75, v61, 63 ; SI-NEXT: s_and_b32 s6, s58, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s15, s76, 0x300 +; SI-NEXT: s_add_i32 s50, s75, 3 +; SI-NEXT: v_readlane_b32 s76, v61, 62 +; SI-NEXT: s_or_b32 s79, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 20 ; SI-NEXT: s_and_b32 s75, s50, 0xff ; SI-NEXT: s_lshl_b32 s76, s76, 8 -; SI-NEXT: s_or_b32 s79, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 37 -; SI-NEXT: s_or_b32 s75, s76, s75 -; SI-NEXT: v_readlane_b32 s76, v61, 59 ; SI-NEXT: s_add_i32 s59, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 33 -; SI-NEXT: s_add_i32 s18, s77, 0x300 -; SI-NEXT: s_add_i32 s48, s76, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 58 +; SI-NEXT: v_readlane_b32 s7, v62, 23 +; SI-NEXT: s_or_b32 s75, s76, s75 +; SI-NEXT: v_readlane_b32 s76, v61, 61 ; SI-NEXT: s_and_b32 s6, s59, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_add_i32 s18, s77, 0x300 +; SI-NEXT: s_add_i32 s48, s76, 3 +; SI-NEXT: v_readlane_b32 s77, v61, 60 +; SI-NEXT: s_or_b32 s78, s7, s6 +; SI-NEXT: v_readlane_b32 s6, v62, 17 ; SI-NEXT: s_and_b32 s76, s48, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 8 -; SI-NEXT: s_or_b32 s78, s7, s6 -; SI-NEXT: v_readlane_b32 s6, v62, 35 -; SI-NEXT: s_or_b32 s76, s77, s76 -; SI-NEXT: v_readlane_b32 s77, v61, 57 ; SI-NEXT: s_add_i32 s57, s6, 3 -; SI-NEXT: v_readlane_b32 s7, v62, 32 +; SI-NEXT: v_readlane_b32 s7, v62, 22 +; SI-NEXT: s_or_b32 s76, s77, s76 +; SI-NEXT: v_readlane_b32 s77, v61, 59 +; SI-NEXT: s_and_b32 s6, s57, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s11, s72, 0x300 ; SI-NEXT: s_add_i32 s72, s79, 0x300 ; SI-NEXT: s_add_i32 s37, s77, 3 -; SI-NEXT: v_readlane_b32 s79, v61, 56 -; SI-NEXT: s_and_b32 s6, s57, 0xff -; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: v_readlane_b32 s79, v61, 58 +; SI-NEXT: s_or_b32 s88, s7, s6 ; SI-NEXT: s_and_b32 s77, s37, 0xff ; SI-NEXT: s_lshl_b32 s79, s79, 8 -; SI-NEXT: s_or_b32 s88, s7, s6 -; SI-NEXT: s_or_b32 s77, s79, s77 -; SI-NEXT: v_readlane_b32 s79, v61, 55 +; SI-NEXT: s_add_i32 s35, s38, 3 ; SI-NEXT: s_add_i32 s21, s89, 0x300 ; SI-NEXT: s_add_i32 s89, s88, 0x300 -; SI-NEXT: s_add_i32 s35, s79, 3 -; SI-NEXT: v_readlane_b32 s88, v61, 54 +; SI-NEXT: s_or_b32 s77, s79, s77 ; SI-NEXT: s_and_b32 s79, s35, 0xff -; SI-NEXT: s_lshl_b32 s88, s88, 8 +; SI-NEXT: s_lshl_b32 s88, s36, 8 ; SI-NEXT: s_or_b32 s79, s88, s79 -; SI-NEXT: v_readlane_b32 s88, v61, 53 +; SI-NEXT: 
v_readlane_b32 s88, v61, 57 ; SI-NEXT: s_add_i32 s25, s92, 0x300 ; SI-NEXT: s_add_i32 s30, s88, 3 -; SI-NEXT: v_readlane_b32 s92, v61, 52 +; SI-NEXT: v_readlane_b32 s92, v61, 56 ; SI-NEXT: s_and_b32 s88, s30, 0xff ; SI-NEXT: s_lshl_b32 s92, s92, 8 ; SI-NEXT: s_or_b32 s88, s92, s88 -; SI-NEXT: v_readlane_b32 s92, v61, 51 +; SI-NEXT: v_readlane_b32 s92, v61, 55 ; SI-NEXT: s_add_i32 s94, s92, 3 ; SI-NEXT: s_and_b32 s92, s94, 0xff ; SI-NEXT: s_lshl_b32 s91, s91, 8 @@ -183523,52 +183625,52 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_and_b32 s90, s90, 0xff ; SI-NEXT: s_lshl_b32 s92, s31, 8 ; SI-NEXT: s_or_b32 s90, s92, s90 -; SI-NEXT: v_readlane_b32 s92, v61, 50 +; SI-NEXT: v_readlane_b32 s92, v61, 54 ; SI-NEXT: s_add_i32 s92, s92, 3 ; SI-NEXT: s_add_i32 s26, s93, 0x300 ; SI-NEXT: s_and_b32 s92, s92, 0xff ; SI-NEXT: s_lshl_b32 s93, s95, 8 ; SI-NEXT: s_or_b32 s92, s93, s92 -; SI-NEXT: v_readlane_b32 s93, v61, 49 +; SI-NEXT: v_readlane_b32 s93, v61, 53 ; SI-NEXT: s_add_i32 s93, s93, 3 ; SI-NEXT: s_and_b32 s93, s93, 0xff ; SI-NEXT: s_lshl_b32 s94, s55, 8 ; SI-NEXT: s_or_b32 s93, s94, s93 -; SI-NEXT: v_readlane_b32 s94, v61, 48 +; SI-NEXT: v_readlane_b32 s94, v61, 51 ; SI-NEXT: s_add_i32 s94, s94, 3 ; SI-NEXT: s_and_b32 s94, s94, 0xff ; SI-NEXT: s_lshl_b32 s95, s99, 8 ; SI-NEXT: s_or_b32 s94, s95, s94 -; SI-NEXT: v_readlane_b32 s95, v61, 1 +; SI-NEXT: v_readlane_b32 s95, v61, 52 ; SI-NEXT: s_add_i32 s95, s95, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 0 +; SI-NEXT: v_readlane_b32 s30, v61, 50 ; SI-NEXT: s_add_i32 s6, vcc_lo, 0x300 ; SI-NEXT: s_and_b32 s95, s95, 0xff ; SI-NEXT: s_lshl_b32 vcc_lo, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 47 +; SI-NEXT: v_readlane_b32 s30, v61, 49 ; SI-NEXT: s_or_b32 s95, vcc_lo, s95 ; SI-NEXT: s_add_i32 vcc_lo, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 2 +; SI-NEXT: v_readlane_b32 s30, v61, 48 ; SI-NEXT: s_add_i32 s28, vcc_hi, 0x300 ; SI-NEXT: s_and_b32 vcc_lo, vcc_lo, 0xff ; SI-NEXT: s_lshl_b32 vcc_hi, s30, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 46 +; SI-NEXT: v_readlane_b32 s30, v61, 47 ; SI-NEXT: s_or_b32 vcc_lo, vcc_hi, vcc_lo ; SI-NEXT: s_add_i32 vcc_hi, s30, 3 -; SI-NEXT: v_readlane_b32 s30, v61, 45 +; SI-NEXT: v_readlane_b32 s30, v61, 46 ; SI-NEXT: s_and_b32 vcc_hi, vcc_hi, 0xff ; SI-NEXT: s_lshl_b32 s30, s30, 8 ; SI-NEXT: s_or_b32 vcc_hi, s30, vcc_hi -; SI-NEXT: v_readlane_b32 s30, v61, 44 +; SI-NEXT: v_readlane_b32 s30, v61, 45 ; SI-NEXT: s_add_i32 s30, s30, 3 -; SI-NEXT: v_readlane_b32 s31, v61, 43 +; SI-NEXT: v_readlane_b32 s31, v61, 44 ; SI-NEXT: s_and_b32 s30, s30, 0xff ; SI-NEXT: s_lshl_b32 s31, s31, 8 ; SI-NEXT: s_or_b32 s30, s31, s30 -; SI-NEXT: v_readlane_b32 s31, v61, 42 +; SI-NEXT: v_readlane_b32 s31, v61, 43 ; SI-NEXT: s_add_i32 s29, s34, 0x300 ; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: v_readlane_b32 s34, v61, 41 +; SI-NEXT: v_readlane_b32 s34, v61, 42 ; SI-NEXT: s_and_b32 s31, s31, 0xff ; SI-NEXT: s_lshl_b32 s34, s34, 8 ; SI-NEXT: s_or_b32 s31, s34, s31 @@ -183576,25 +183678,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v1, s31 ; SI-NEXT: s_addk_i32 s30, 0x300 ; SI-NEXT: s_addk_i32 vcc_hi, 0x300 -; SI-NEXT: v_readlane_b32 s34, v61, 40 +; SI-NEXT: v_readlane_b32 s34, v61, 41 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s30 ; SI-NEXT: s_add_i32 s34, s34, 3 -; SI-NEXT: v_readlane_b32 s35, v61, 39 +; SI-NEXT: v_readlane_b32 
s35, v61, 40 ; SI-NEXT: s_and_b32 s34, s34, 0xff ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_hi ; SI-NEXT: s_lshl_b32 s35, s35, 8 -; SI-NEXT: s_addk_i32 vcc_lo, 0x300 +; SI-NEXT: s_addk_i32 s95, 0x300 ; SI-NEXT: s_or_b32 s34, s35, s34 -; SI-NEXT: v_readlane_b32 s35, v61, 38 +; SI-NEXT: v_readlane_b32 s35, v61, 39 ; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: v_readlane_b32 s36, v61, 37 +; SI-NEXT: v_readlane_b32 s36, v61, 38 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, vcc_lo +; SI-NEXT: v_cvt_f32_f16_e32 v1, s95 ; SI-NEXT: s_and_b32 s35, s35, 0xff ; SI-NEXT: s_lshl_b32 s36, s36, 8 ; SI-NEXT: s_or_b32 s35, s36, s35 @@ -183641,13 +183743,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_addk_i32 s92, 0x300 ; SI-NEXT: s_addk_i32 s93, 0x300 ; SI-NEXT: s_addk_i32 s94, 0x300 -; SI-NEXT: s_addk_i32 s95, 0x300 +; SI-NEXT: s_addk_i32 vcc_lo, 0x300 ; SI-NEXT: s_addk_i32 s34, 0x300 ; SI-NEXT: s_addk_i32 s35, 0x300 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s35 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, vcc_lo ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v7, s95 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s94 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s93 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s92 @@ -183706,7 +183808,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: .LBB93_3: ; %end +; SI-NEXT: .LBB93_5: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -183761,22 +183863,22 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -183993,134 +184095,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB93_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; 
kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s17, s19 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_mov_b32 s19, s50 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_mov_b32 s23, s48 -; SI-NEXT: s_mov_b32 s26, s37 -; SI-NEXT: s_mov_b32 s28, s29 -; SI-NEXT: s_mov_b32 s29, s76 -; SI-NEXT: s_mov_b32 s59, s58 -; SI-NEXT: s_mov_b32 s56, s47 -; SI-NEXT: s_mov_b32 s46, s41 -; SI-NEXT: s_mov_b32 s12, s11 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: s_mov_b32 s7, s97 -; SI-NEXT: s_mov_b32 s97, s81 -; SI-NEXT: s_mov_b32 s81, s85 -; SI-NEXT: s_mov_b32 s6, s40 -; SI-NEXT: s_mov_b32 s40, s72 -; SI-NEXT: s_mov_b32 s45, s73 -; SI-NEXT: s_mov_b32 s15, s89 -; SI-NEXT: s_mov_b32 s24, s98 -; SI-NEXT: s_mov_b32 s20, s88 -; SI-NEXT: s_mov_b32 s99, s55 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_readlane_b32 s75, v61, 7 -; SI-NEXT: v_readlane_b32 s76, v61, 3 -; SI-NEXT: v_readlane_b32 s77, v61, 5 -; SI-NEXT: v_readlane_b32 s78, v61, 4 -; SI-NEXT: v_readlane_b32 s92, v61, 6 -; SI-NEXT: v_readlane_b32 s39, v61, 9 -; SI-NEXT: v_readlane_b32 s37, v61, 8 -; SI-NEXT: v_readlane_b32 s30, v61, 10 -; SI-NEXT: v_readlane_b32 s48, v61, 11 -; SI-NEXT: v_readlane_b32 s52, v61, 13 -; SI-NEXT: v_readlane_b32 s35, v61, 12 -; SI-NEXT: v_readlane_b32 s50, v61, 15 -; SI-NEXT: v_readlane_b32 s64, v61, 14 -; SI-NEXT: v_readlane_b32 s54, v61, 17 -; SI-NEXT: v_readlane_b32 s67, v61, 16 -; SI-NEXT: v_readlane_b32 s65, v61, 18 -; SI-NEXT: v_readlane_b32 s70, v61, 19 -; SI-NEXT: v_readlane_b32 s49, v61, 21 -; SI-NEXT: v_readlane_b32 s71, v61, 20 -; SI-NEXT: v_readlane_b32 s80, v61, 23 -; SI-NEXT: v_readlane_b32 s83, v61, 22 -; SI-NEXT: v_readlane_b32 s84, v61, 25 -; SI-NEXT: v_readlane_b32 s82, v61, 24 -; SI-NEXT: v_readlane_b32 s87, v61, 26 -; SI-NEXT: v_readlane_b32 s86, v61, 27 -; SI-NEXT: v_readlane_b32 s96, v61, 29 -; SI-NEXT: v_readlane_b32 s51, v61, 28 -; SI-NEXT: s_mov_b32 s55, s93 -; SI-NEXT: s_mov_b32 s95, s91 -; SI-NEXT: v_readlane_b32 s94, v61, 31 -; SI-NEXT: s_mov_b32 s31, s90 -; SI-NEXT: v_readlane_b32 s34, v61, 30 -; SI-NEXT: v_readlane_b32 s53, v61, 32 -; SI-NEXT: v_readlane_b32 s66, v61, 33 -; SI-NEXT: v_readlane_b32 s68, v61, 34 -; SI-NEXT: v_readlane_b32 s69, v61, 35 -; SI-NEXT: v_readlane_b32 s8, v61, 36 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; 
implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_branch .LBB93_2 ; ; VI-LABEL: bitcast_v128i8_to_v64f16_scalar: ; VI: ; %bb.0: @@ -190943,24 +190917,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -190981,10 +190937,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr63 @@ -190992,7 +190947,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -191001,13 +190955,33 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; 
implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -191015,9 +190989,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -191041,7 +191018,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(43) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -191100,180 +191077,195 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB94_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 
4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 
8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -191283,9 +191275,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29 ; GFX9-NEXT: .LBB94_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v43, v50 +; GFX9-NEXT: v_mov_b32_e32 v50, v40 +; GFX9-NEXT: v_mov_b32_e32 v40, v55 +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB94_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 @@ -191298,12 +191294,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] @@ -191335,164 +191355,149 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 
op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -191509,41 +191514,50 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 -; GFX9-NEXT: 
v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191552,84 +191566,103 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -191639,16 +191672,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -191658,23 +191691,16 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 @@ -191682,14 +191708,18 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -191698,11 +191728,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -191711,10 +191741,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191724,11 +191754,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -191737,10 +191767,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191750,11 +191780,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -191763,10 +191793,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191776,11 +191806,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded 
Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -191789,10 +191819,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191802,53 +191832,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt 
vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -204812,36 +204823,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: v_writelane_b32 v41, s97, 33 -; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: v_readfirstlane_b32 s39, v26 +; SI-NEXT: v_readfirstlane_b32 s56, v12 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: v_readfirstlane_b32 s47, v12 -; SI-NEXT: v_writelane_b32 v42, s39, 0 -; SI-NEXT: v_readfirstlane_b32 s56, v11 -; SI-NEXT: 
v_writelane_b32 v42, s47, 1 -; SI-NEXT: v_readfirstlane_b32 s48, v24 -; SI-NEXT: v_writelane_b32 v42, s56, 2 -; SI-NEXT: v_readfirstlane_b32 s49, v23 -; SI-NEXT: v_writelane_b32 v42, s48, 3 -; SI-NEXT: v_readfirstlane_b32 s50, v21 -; SI-NEXT: v_writelane_b32 v42, s49, 4 -; SI-NEXT: v_readfirstlane_b32 s51, v22 -; SI-NEXT: v_writelane_b32 v42, s50, 5 -; SI-NEXT: v_writelane_b32 v42, s51, 6 -; SI-NEXT: v_readfirstlane_b32 s57, v20 -; SI-NEXT: v_readfirstlane_b32 s58, v19 -; SI-NEXT: v_readfirstlane_b32 s64, v29 -; SI-NEXT: v_readfirstlane_b32 s65, v30 -; SI-NEXT: v_readfirstlane_b32 s59, v28 -; SI-NEXT: v_readfirstlane_b32 s60, v27 -; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s57, v11 +; SI-NEXT: v_writelane_b32 v42, s56, 0 +; SI-NEXT: v_readfirstlane_b32 s50, v24 +; SI-NEXT: v_writelane_b32 v42, s57, 1 +; SI-NEXT: v_readfirstlane_b32 s51, v23 +; SI-NEXT: v_writelane_b32 v42, s50, 2 +; SI-NEXT: v_readfirstlane_b32 s52, v21 +; SI-NEXT: v_writelane_b32 v42, s51, 3 +; SI-NEXT: v_readfirstlane_b32 s53, v22 +; SI-NEXT: v_writelane_b32 v42, s52, 4 +; SI-NEXT: v_writelane_b32 v42, s53, 5 +; SI-NEXT: v_readfirstlane_b32 s58, v20 +; SI-NEXT: v_readfirstlane_b32 s59, v19 +; SI-NEXT: v_readfirstlane_b32 s67, v29 +; SI-NEXT: v_readfirstlane_b32 s71, v30 +; SI-NEXT: v_writelane_b32 v41, s97, 33 +; SI-NEXT: v_readfirstlane_b32 s60, v28 +; SI-NEXT: v_readfirstlane_b32 s61, v27 +; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: v_readfirstlane_b32 s21, v9 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 14 @@ -204863,48 +204874,48 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s44, v36 -; SI-NEXT: v_readfirstlane_b32 s90, v37 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: v_readfirstlane_b32 s24, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s6, v38 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s15, v8 -; SI-NEXT: v_readfirstlane_b32 s18, v7 -; SI-NEXT: v_readfirstlane_b32 s21, v5 -; SI-NEXT: v_readfirstlane_b32 s22, v6 -; SI-NEXT: v_readfirstlane_b32 s40, v17 -; SI-NEXT: v_readfirstlane_b32 s41, v18 -; SI-NEXT: v_readfirstlane_b32 s42, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v3 -; SI-NEXT: v_readfirstlane_b32 s76, v16 -; SI-NEXT: v_readfirstlane_b32 s77, v15 -; SI-NEXT: v_readfirstlane_b32 s38, v25 +; SI-NEXT: v_readfirstlane_b32 s9, v38 +; SI-NEXT: v_readfirstlane_b32 s22, v10 +; SI-NEXT: v_readfirstlane_b32 s40, v8 +; SI-NEXT: v_readfirstlane_b32 s41, v7 +; SI-NEXT: v_readfirstlane_b32 s42, v5 +; SI-NEXT: v_readfirstlane_b32 s43, v6 +; SI-NEXT: v_readfirstlane_b32 s76, v17 +; SI-NEXT: v_readfirstlane_b32 s77, v18 +; SI-NEXT: v_readfirstlane_b32 s46, v4 +; SI-NEXT: v_readfirstlane_b32 s47, v3 +; SI-NEXT: v_readfirstlane_b32 s38, v13 +; SI-NEXT: v_readfirstlane_b32 s39, v14 +; SI-NEXT: v_readfirstlane_b32 s48, v25 +; SI-NEXT: v_readfirstlane_b32 s49, v26 ; 
SI-NEXT: v_writelane_b32 v41, s99, 35 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: v_readfirstlane_b32 s95, v55 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s95, v40 +; SI-NEXT: v_readfirstlane_b32 s93, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: v_writelane_b32 v43, s4, 20 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: v_writelane_b32 v43, s4, 21 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 -; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: v_writelane_b32 v43, s4, 22 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: v_writelane_b32 v43, s4, 23 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: v_writelane_b32 v43, s4, 24 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 @@ -204915,37 +204926,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s91, v32 +; SI-NEXT: v_readfirstlane_b32 s8, v32 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s8, v33 +; SI-NEXT: v_readfirstlane_b32 s11, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 -; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: v_writelane_b32 v43, s4, 25 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: v_writelane_b32 v43, s4, 26 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 -; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: v_writelane_b32 v43, s4, 27 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 -; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: v_writelane_b32 v43, s4, 28 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 -; SI-NEXT: v_writelane_b32 v43, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s89, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s89, v38 +; SI-NEXT: v_readfirstlane_b32 s70, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s78, v39 +; SI-NEXT: v_readfirstlane_b32 s7, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s7, v48 +; SI-NEXT: v_readfirstlane_b32 s10, v48 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s82, v49 ; SI-NEXT: s_waitcnt vmcnt(7) @@ -204959,39 +204969,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: v_writelane_b32 
v43, s4, 30 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s70, v33 +; SI-NEXT: v_readfirstlane_b32 s69, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: v_writelane_b32 v43, s4, 30 -; SI-NEXT: v_readfirstlane_b32 s4, v32 -; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: v_readfirstlane_b32 s92, v32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s18, v34 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s9, v35 +; SI-NEXT: v_readfirstlane_b32 s12, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s13, v36 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 -; SI-NEXT: v_writelane_b32 v43, s4, 33 -; SI-NEXT: v_readfirstlane_b32 s10, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 31 ; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 -; SI-NEXT: v_writelane_b32 v43, s4, 34 +; SI-NEXT: v_writelane_b32 v43, s4, 32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v43, s4, 35 +; SI-NEXT: v_readfirstlane_b32 s90, v38 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s4, v39 -; SI-NEXT: v_writelane_b32 v43, s4, 36 +; SI-NEXT: v_readfirstlane_b32 s91, v39 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s69, v48 +; SI-NEXT: v_readfirstlane_b32 s68, v48 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s30, v49 +; SI-NEXT: v_readfirstlane_b32 s37, v49 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s16, v50 +; SI-NEXT: v_readfirstlane_b32 s84, v50 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s36, v51 +; SI-NEXT: v_readfirstlane_b32 s6, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 ; SI-NEXT: s_waitcnt vmcnt(3) @@ -205007,50 +205014,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 -; SI-NEXT: v_writelane_b32 v43, s4, 37 +; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 -; SI-NEXT: v_writelane_b32 v43, s4, 38 +; SI-NEXT: v_writelane_b32 v43, s4, 34 ; SI-NEXT: v_readfirstlane_b32 s4, v53 -; SI-NEXT: v_writelane_b32 v43, s4, 39 +; SI-NEXT: v_writelane_b32 v43, s4, 35 ; SI-NEXT: v_readfirstlane_b32 s4, v54 -; SI-NEXT: v_writelane_b32 v43, s4, 40 -; SI-NEXT: v_writelane_b32 v43, s44, 41 -; SI-NEXT: v_writelane_b32 v43, s6, 42 -; SI-NEXT: v_writelane_b32 v43, s7, 43 -; SI-NEXT: v_writelane_b32 v43, s8, 44 -; SI-NEXT: v_writelane_b32 v43, s9, 45 -; SI-NEXT: v_writelane_b32 v43, s10, 46 -; SI-NEXT: v_writelane_b32 v43, s11, 47 -; SI-NEXT: v_writelane_b32 v43, s12, 48 -; SI-NEXT: v_writelane_b32 v43, s13, 49 -; SI-NEXT: v_writelane_b32 v43, s14, 50 -; SI-NEXT: v_writelane_b32 v43, s15, 51 -; SI-NEXT: v_writelane_b32 v43, s18, 52 -; SI-NEXT: v_writelane_b32 v43, s21, 53 -; SI-NEXT: v_writelane_b32 v43, s22, 54 -; SI-NEXT: v_writelane_b32 v43, s40, 55 -; SI-NEXT: v_writelane_b32 v43, s41, 56 -; SI-NEXT: v_writelane_b32 v43, s42, 57 -; SI-NEXT: v_writelane_b32 v43, s43, 58 -; SI-NEXT: v_writelane_b32 v43, s76, 59 -; SI-NEXT: v_writelane_b32 v43, s77, 60 +; SI-NEXT: 
v_writelane_b32 v43, s4, 36 +; SI-NEXT: v_writelane_b32 v43, s6, 37 +; SI-NEXT: v_writelane_b32 v43, s7, 38 +; SI-NEXT: v_writelane_b32 v43, s8, 39 +; SI-NEXT: v_writelane_b32 v43, s18, 40 +; SI-NEXT: v_writelane_b32 v43, s9, 41 +; SI-NEXT: v_writelane_b32 v43, s10, 42 +; SI-NEXT: v_writelane_b32 v43, s11, 43 +; SI-NEXT: v_writelane_b32 v43, s12, 44 +; SI-NEXT: v_writelane_b32 v43, s13, 45 +; SI-NEXT: v_writelane_b32 v43, s14, 46 +; SI-NEXT: v_writelane_b32 v43, s15, 47 +; SI-NEXT: v_writelane_b32 v43, s21, 48 +; SI-NEXT: v_writelane_b32 v43, s22, 49 +; SI-NEXT: v_writelane_b32 v43, s40, 50 +; SI-NEXT: v_writelane_b32 v43, s41, 51 +; SI-NEXT: v_writelane_b32 v43, s42, 52 +; SI-NEXT: v_writelane_b32 v43, s43, 53 +; SI-NEXT: v_writelane_b32 v43, s76, 54 +; SI-NEXT: v_writelane_b32 v43, s77, 55 +; SI-NEXT: v_writelane_b32 v43, s46, 56 +; SI-NEXT: v_writelane_b32 v43, s47, 57 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s17, v33 +; SI-NEXT: v_readfirstlane_b32 s16, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: v_readfirstlane_b32 s35, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s23, v35 -; SI-NEXT: v_readfirstlane_b32 s25, v31 -; SI-NEXT: v_readfirstlane_b32 s28, v32 +; SI-NEXT: v_readfirstlane_b32 s19, v35 +; SI-NEXT: v_readfirstlane_b32 s28, v31 +; SI-NEXT: v_readfirstlane_b32 s29, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s26, v36 +; SI-NEXT: v_readfirstlane_b32 s87, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s88, v37 +; SI-NEXT: v_readfirstlane_b32 s79, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s79, v38 +; SI-NEXT: v_readfirstlane_b32 s27, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s75, v39 +; SI-NEXT: v_readfirstlane_b32 s25, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 @@ -205063,39 +205071,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s24, v49 +; SI-NEXT: v_readfirstlane_b32 s88, v49 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s85, v50 +; SI-NEXT: v_readfirstlane_b32 s86, v50 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s66, v51 +; SI-NEXT: v_readfirstlane_b32 s34, v51 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 vcc_lo, v13 -; SI-NEXT: v_readfirstlane_b32 vcc_hi, v14 -; SI-NEXT: v_writelane_b32 v43, vcc_lo, 61 -; SI-NEXT: v_writelane_b32 v43, vcc_hi, 62 -; SI-NEXT: v_writelane_b32 v43, s38, 63 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v16 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v15 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 58 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 59 +; SI-NEXT: v_writelane_b32 v43, s38, 60 +; SI-NEXT: v_writelane_b32 v43, s39, 61 +; SI-NEXT: v_writelane_b32 v43, s48, 62 +; SI-NEXT: v_writelane_b32 v43, s49, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s20, v31 +; SI-NEXT: v_readfirstlane_b32 s17, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 
s19, v32 +; SI-NEXT: v_readfirstlane_b32 s20, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s27, v33 +; SI-NEXT: v_readfirstlane_b32 s94, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s94, v34 +; SI-NEXT: v_readfirstlane_b32 s26, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s72, v35 +; SI-NEXT: v_readfirstlane_b32 s73, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s73, v36 +; SI-NEXT: v_readfirstlane_b32 s74, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s67, v37 +; SI-NEXT: v_readfirstlane_b32 s80, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s71, v38 +; SI-NEXT: v_readfirstlane_b32 s81, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s97, v39 +; SI-NEXT: v_readfirstlane_b32 s36, v39 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 @@ -205105,141 +205116,141 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s35, v48 +; SI-NEXT: v_readfirstlane_b32 s31, v48 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: v_readfirstlane_b32 s23, v49 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s87, v50 +; SI-NEXT: v_readfirstlane_b32 s83, v50 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s63, v51 +; SI-NEXT: v_readfirstlane_b32 s72, v51 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: v_readfirstlane_b32 s75, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s81, v32 +; SI-NEXT: v_readfirstlane_b32 s78, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_readfirstlane_b32 s80, v33 +; SI-NEXT: v_readfirstlane_b32 s97, v33 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_readfirstlane_b32 s86, v34 +; SI-NEXT: v_readfirstlane_b32 s98, v34 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_readfirstlane_b32 s34, v35 +; SI-NEXT: v_readfirstlane_b32 s30, v35 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_readfirstlane_b32 s84, v36 +; SI-NEXT: v_readfirstlane_b32 s85, v36 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_readfirstlane_b32 s31, v37 +; SI-NEXT: v_readfirstlane_b32 s66, v37 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_readfirstlane_b32 s61, v38 +; SI-NEXT: v_readfirstlane_b32 s62, v38 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s62, v39 +; SI-NEXT: v_readfirstlane_b32 s63, v39 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s53, v48 +; SI-NEXT: v_readfirstlane_b32 s55, v48 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s52, v49 -; SI-NEXT: v_writelane_b32 v42, s52, 7 -; SI-NEXT: v_writelane_b32 v42, s53, 8 -; SI-NEXT: v_writelane_b32 v42, s57, 9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s54, v50 -; SI-NEXT: v_writelane_b32 v42, s58, 10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_readfirstlane_b32 s55, v51 -; SI-NEXT: v_writelane_b32 v42, s54, 11 -; SI-NEXT: 
v_writelane_b32 v42, s55, 12 -; SI-NEXT: v_writelane_b32 v42, s64, 13 -; SI-NEXT: v_writelane_b32 v42, s65, 14 -; SI-NEXT: v_writelane_b32 v42, s67, 15 -; SI-NEXT: v_writelane_b32 v42, s71, 16 -; SI-NEXT: v_writelane_b32 v42, s80, 17 -; SI-NEXT: v_writelane_b32 v42, s81, 18 -; SI-NEXT: v_writelane_b32 v42, s59, 19 -; SI-NEXT: v_writelane_b32 v42, s60, 20 -; SI-NEXT: v_writelane_b32 v42, s86, 21 -; SI-NEXT: v_writelane_b32 v42, s97, 22 +; SI-NEXT: v_readfirstlane_b32 s54, v49 +; SI-NEXT: v_writelane_b32 v42, s54, 6 +; SI-NEXT: v_writelane_b32 v42, s55, 7 +; SI-NEXT: v_writelane_b32 v42, s58, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s64, v50 +; SI-NEXT: v_writelane_b32 v42, s59, 9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s65, v51 +; SI-NEXT: v_writelane_b32 v42, s64, 10 +; SI-NEXT: v_writelane_b32 v42, s65, 11 +; SI-NEXT: v_writelane_b32 v42, s67, 12 +; SI-NEXT: v_writelane_b32 v42, s71, 13 +; SI-NEXT: v_writelane_b32 v42, s80, 14 +; SI-NEXT: v_writelane_b32 v42, s81, 15 +; SI-NEXT: v_writelane_b32 v42, s97, 16 +; SI-NEXT: v_writelane_b32 v42, s78, 17 +; SI-NEXT: v_writelane_b32 v42, s60, 18 +; SI-NEXT: v_writelane_b32 v42, s61, 19 +; SI-NEXT: v_writelane_b32 v42, s98, 20 +; SI-NEXT: v_writelane_b32 v42, s36, 21 +; SI-NEXT: v_writelane_b32 v42, s30, 22 ; SI-NEXT: v_writelane_b32 v42, s34, 23 -; SI-NEXT: v_writelane_b32 v42, s66, 24 -; SI-NEXT: v_writelane_b32 v42, s85, 25 -; SI-NEXT: v_writelane_b32 v42, s31, 26 -; SI-NEXT: v_writelane_b32 v42, s84, 27 +; SI-NEXT: v_writelane_b32 v42, s86, 24 +; SI-NEXT: v_writelane_b32 v42, s66, 25 +; SI-NEXT: v_writelane_b32 v42, s85, 26 +; SI-NEXT: v_writelane_b32 v42, s31, 27 ; SI-NEXT: v_writelane_b32 v42, s35, 28 -; SI-NEXT: v_writelane_b32 v42, s98, 29 +; SI-NEXT: v_writelane_b32 v42, s16, 29 ; SI-NEXT: v_writelane_b32 v42, s17, 30 -; SI-NEXT: v_writelane_b32 v42, s20, 31 -; SI-NEXT: v_writelane_b32 v42, s61, 32 -; SI-NEXT: v_writelane_b32 v42, s19, 33 -; SI-NEXT: v_writelane_b32 v42, s62, 34 +; SI-NEXT: v_writelane_b32 v42, s62, 31 +; SI-NEXT: v_writelane_b32 v42, s20, 32 +; SI-NEXT: v_writelane_b32 v42, s63, 33 +; SI-NEXT: v_writelane_b32 v42, s19, 34 ; SI-NEXT: v_writelane_b32 v42, s23, 35 ; SI-NEXT: v_writelane_b32 v42, s83, 36 ; SI-NEXT: v_writelane_b32 v42, s87, 37 ; SI-NEXT: v_writelane_b32 v42, s26, 38 ; SI-NEXT: v_writelane_b32 v42, s94, 39 -; SI-NEXT: v_writelane_b32 v42, s27, 40 -; SI-NEXT: v_writelane_b32 v42, s63, 41 +; SI-NEXT: v_writelane_b32 v42, s72, 40 +; SI-NEXT: v_writelane_b32 v42, s27, 41 ; SI-NEXT: v_writelane_b32 v42, s79, 42 -; SI-NEXT: v_writelane_b32 v42, s88, 43 -; SI-NEXT: v_writelane_b32 v42, s72, 44 -; SI-NEXT: v_writelane_b32 v42, s73, 45 -; SI-NEXT: v_writelane_b32 v42, s74, 46 -; SI-NEXT: v_writelane_b32 v42, s75, 47 -; SI-NEXT: v_writelane_b32 v42, s24, 48 -; SI-NEXT: v_writelane_b32 v42, s25, 49 -; SI-NEXT: v_writelane_b32 v42, s28, 50 +; SI-NEXT: v_writelane_b32 v42, s73, 43 +; SI-NEXT: v_writelane_b32 v42, s74, 44 +; SI-NEXT: v_writelane_b32 v42, s75, 45 +; SI-NEXT: v_writelane_b32 v42, s25, 46 +; SI-NEXT: v_writelane_b32 v42, s88, 47 +; SI-NEXT: v_writelane_b32 v42, s28, 48 +; SI-NEXT: v_writelane_b32 v42, s29, 49 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_readlane_b32 s4, v43, 13 ; SI-NEXT: v_readlane_b32 s5, v43, 12 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s29, s4, s5 +; SI-NEXT: s_or_b32 s44, s4, s5 ; SI-NEXT: v_readlane_b32 s4, v43, 5 ; SI-NEXT: v_readlane_b32 s5, v43, 4 ; SI-NEXT: 
s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s45, s4, s5 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_and_b32 s4, s47, 0xff +; SI-NEXT: s_lshl_b32 s5, s46, 8 ; SI-NEXT: s_or_b32 s46, s4, s5 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: s_and_b32 s4, s57, 0xff +; SI-NEXT: s_lshl_b32 s5, s56, 8 ; SI-NEXT: s_or_b32 s47, s4, s5 -; SI-NEXT: s_and_b32 s4, s58, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_and_b32 s4, s59, 0xff +; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: s_or_b32 s56, s4, s5 -; SI-NEXT: s_and_b32 s4, s60, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: s_and_b32 s4, s61, 0xff +; SI-NEXT: s_lshl_b32 s5, s60, 8 ; SI-NEXT: s_or_b32 s57, s4, s5 -; SI-NEXT: s_and_b32 s4, s62, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_and_b32 s4, s63, 0xff +; SI-NEXT: s_lshl_b32 s5, s62, 8 ; SI-NEXT: s_or_b32 s58, s4, s5 -; SI-NEXT: s_and_b32 s4, s74, 0xff -; SI-NEXT: s_lshl_b32 s5, s63, 8 -; SI-NEXT: s_or_b32 s59, s4, s5 -; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_and_b32 s4, s75, 0xff ; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s73, 8 ; SI-NEXT: s_or_b32 s60, s4, s5 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: s_or_b32 s61, s4, s5 -; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_and_b32 s4, s88, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s29, 0xff +; SI-NEXT: s_lshl_b32 s5, s28, 8 ; SI-NEXT: s_or_b32 s62, s4, s5 -; SI-NEXT: s_and_b32 s4, s36, 0xff -; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s84, 8 ; SI-NEXT: s_or_b32 s63, s4, s5 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_and_b32 s4, s13, 0xff +; SI-NEXT: s_lshl_b32 s5, s12, 8 ; SI-NEXT: s_or_b32 s72, s4, s5 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s7, 8 ; SI-NEXT: s_or_b32 s73, s4, s5 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_and_b32 s4, s11, 0xff +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: s_or_b32 s74, s4, s5 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_and_b32 s4, s9, 0xff +; SI-NEXT: s_lshl_b32 s5, s24, 8 ; SI-NEXT: s_or_b32 s75, s4, s5 ; SI-NEXT: v_readlane_b32 s4, v43, 9 ; SI-NEXT: v_readlane_b32 s5, v43, 8 @@ -205258,7 +205269,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: v_writelane_b32 v42, s7, 52 ; SI-NEXT: s_or_b32 s4, s6, s4 ; SI-NEXT: s_or_b32 s5, s5, s7 ; SI-NEXT: v_readlane_b32 s6, v43, 1 @@ -205266,345 +205277,340 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_or_b32 s7, s6, s7 -; SI-NEXT: s_and_b32 s6, s11, 0xff +; SI-NEXT: s_and_b32 s6, s14, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s8, s12, 24 -; SI-NEXT: s_or_b32 s37, s8, s6 +; SI-NEXT: s_lshl_b32 s8, s15, 24 +; SI-NEXT: s_or_b32 s9, s8, s6 ; SI-NEXT: v_readlane_b32 s6, v43, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: v_readlane_b32 s8, v43, 2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: 
s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: v_writelane_b32 v42, s9, 53 ; SI-NEXT: s_or_b32 s6, s8, s6 -; SI-NEXT: s_and_b32 s8, s18, 0xff -; SI-NEXT: s_lshl_b32 s9, s15, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s8, s41, 0xff +; SI-NEXT: s_lshl_b32 s9, s40, 8 ; SI-NEXT: s_or_b32 s9, s8, s9 -; SI-NEXT: s_and_b32 s8, s13, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s10, s14, 24 -; SI-NEXT: s_or_b32 s68, s10, s8 ; SI-NEXT: s_and_b32 s8, s21, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_lshl_b32 s10, s22, 24 +; SI-NEXT: s_or_b32 s11, s10, s8 +; SI-NEXT: s_and_b32 s8, s42, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s10, s43, 24 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: v_writelane_b32 v42, s11, 54 ; SI-NEXT: s_or_b32 s8, s10, s8 -; SI-NEXT: s_and_b32 s10, s77, 0xff -; SI-NEXT: s_lshl_b32 s11, s76, 8 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: s_and_b32 s10, vcc_hi, 0xff +; SI-NEXT: s_lshl_b32 s11, vcc_lo, 8 ; SI-NEXT: s_or_b32 s11, s10, s11 -; SI-NEXT: s_and_b32 s10, s40, 0xff +; SI-NEXT: s_and_b32 s10, s76, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, s41, 24 -; SI-NEXT: s_or_b32 s99, s12, s10 -; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s12, s77, 24 +; SI-NEXT: s_or_b32 s13, s12, s10 +; SI-NEXT: s_and_b32 s10, s38, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 16 -; SI-NEXT: s_lshl_b32 s12, vcc_hi, 24 +; SI-NEXT: s_lshl_b32 s12, s39, 24 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: v_writelane_b32 v42, s13, 55 ; SI-NEXT: s_or_b32 s10, s12, s10 -; SI-NEXT: s_and_b32 s12, s49, 0xff -; SI-NEXT: s_lshl_b32 s13, s48, 8 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_and_b32 s12, s51, 0xff +; SI-NEXT: s_lshl_b32 s13, s50, 8 ; SI-NEXT: s_or_b32 s13, s12, s13 -; SI-NEXT: s_and_b32 s12, s38, 0xff +; SI-NEXT: s_and_b32 s12, s48, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s14, s39, 24 -; SI-NEXT: s_or_b32 s92, s14, s12 -; SI-NEXT: s_and_b32 s12, s50, 0xff +; SI-NEXT: s_lshl_b32 s14, s49, 24 +; SI-NEXT: s_or_b32 s99, s14, s12 +; SI-NEXT: s_and_b32 s12, s52, 0xff ; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s14, s51, 24 +; SI-NEXT: s_lshl_b32 s14, s53, 24 ; SI-NEXT: s_or_b32 s12, s14, s12 -; SI-NEXT: s_and_b32 s14, s55, 0xff -; SI-NEXT: s_lshl_b32 s15, s54, 8 +; SI-NEXT: s_and_b32 s14, s65, 0xff +; SI-NEXT: s_lshl_b32 s15, s64, 8 ; SI-NEXT: s_or_b32 s15, s14, s15 -; SI-NEXT: s_and_b32 s14, s52, 0xff +; SI-NEXT: s_and_b32 s14, s54, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s24, s53, 24 -; SI-NEXT: s_mov_b32 s28, s90 -; SI-NEXT: s_or_b32 s90, s24, s14 -; SI-NEXT: s_and_b32 s14, s64, 0xff +; SI-NEXT: v_writelane_b32 v42, s24, 56 +; SI-NEXT: s_lshl_b32 s24, s55, 24 +; SI-NEXT: s_or_b32 s24, s24, s14 +; SI-NEXT: s_and_b32 s14, s67, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 16 -; SI-NEXT: s_lshl_b32 s25, s65, 24 +; SI-NEXT: s_lshl_b32 s25, s71, 24 ; SI-NEXT: s_or_b32 s14, s25, s14 -; SI-NEXT: s_and_b32 s25, s34, 0xff -; SI-NEXT: s_lshl_b32 s40, s86, 8 +; SI-NEXT: s_and_b32 s25, s30, 0xff +; SI-NEXT: s_lshl_b32 s40, s98, 8 ; SI-NEXT: s_or_b32 s41, s25, s40 -; SI-NEXT: s_and_b32 s25, s80, 0xff +; SI-NEXT: s_and_b32 s25, s97, 0xff ; SI-NEXT: s_lshl_b32 s25, s25, 16 -; SI-NEXT: s_lshl_b32 s40, s81, 24 -; SI-NEXT: s_or_b32 s18, s40, s25 -; SI-NEXT: s_and_b32 s40, s31, 0xff +; SI-NEXT: s_lshl_b32 s40, s78, 24 +; SI-NEXT: s_or_b32 s25, s40, s25 +; SI-NEXT: s_and_b32 s40, s66, 0xff ; SI-NEXT: s_lshl_b32 s40, s40, 16 -; SI-NEXT: s_lshl_b32 
s42, s84, 24 +; SI-NEXT: s_lshl_b32 s42, s85, 24 ; SI-NEXT: s_or_b32 s40, s42, s40 -; SI-NEXT: s_and_b32 s42, s35, 0xff -; SI-NEXT: s_lshl_b32 s43, s97, 8 +; SI-NEXT: s_and_b32 s42, s31, 0xff +; SI-NEXT: s_lshl_b32 s43, s36, 8 ; SI-NEXT: s_or_b32 s43, s42, s43 -; SI-NEXT: s_and_b32 s42, s71, 0xff +; SI-NEXT: s_and_b32 s42, s81, 0xff ; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_lshl_b32 s76, s67, 24 -; SI-NEXT: s_or_b32 s35, s76, s42 -; SI-NEXT: s_and_b32 s42, s87, 0xff +; SI-NEXT: s_lshl_b32 s76, s80, 24 +; SI-NEXT: s_or_b32 s21, s76, s42 +; SI-NEXT: s_and_b32 s42, s83, 0xff ; SI-NEXT: s_lshl_b32 s42, s42, 16 -; SI-NEXT: s_lshl_b32 s76, s83, 24 +; SI-NEXT: s_lshl_b32 s76, s23, 24 ; SI-NEXT: s_or_b32 s42, s76, s42 -; SI-NEXT: s_and_b32 s76, s19, 0xff -; SI-NEXT: s_lshl_b32 s77, s20, 8 +; SI-NEXT: s_and_b32 s76, s20, 0xff +; SI-NEXT: s_lshl_b32 s77, s17, 8 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s66, 0xff -; SI-NEXT: v_writelane_b32 v42, s78, 52 +; SI-NEXT: s_and_b32 s77, s34, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s85, 24 -; SI-NEXT: s_or_b32 s19, s78, s77 -; SI-NEXT: s_and_b32 s77, s94, 0xff +; SI-NEXT: s_lshl_b32 s78, s86, 24 +; SI-NEXT: s_or_b32 s17, s78, s77 +; SI-NEXT: s_and_b32 s77, s26, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s27, 24 +; SI-NEXT: s_lshl_b32 s78, s94, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff ; SI-NEXT: s_or_b32 vcc_lo, s78, s77 -; SI-NEXT: s_or_b32 vcc_hi, s76, s19 -; SI-NEXT: s_and_b32 s76, s26, 0xff -; SI-NEXT: s_lshl_b32 s77, s23, 8 +; SI-NEXT: s_or_b32 vcc_hi, s76, s17 +; SI-NEXT: s_and_b32 s76, s87, 0xff +; SI-NEXT: s_lshl_b32 s77, s19, 8 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s98, 0xff +; SI-NEXT: s_and_b32 s77, s35, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_or_b32 s71, s78, s77 -; SI-NEXT: s_and_b32 s77, s79, 0xff +; SI-NEXT: s_and_b32 s77, s27, 0xff ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 40 -; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: v_readlane_b32 s16, v43, 36 +; SI-NEXT: s_and_b32 s43, s43, 0xffff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s88, 24 +; SI-NEXT: s_lshl_b32 s78, s79, 24 ; SI-NEXT: s_or_b32 s39, s76, s71 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 39 -; SI-NEXT: s_or_b32 s41, s41, s18 -; SI-NEXT: s_mov_b32 s31, s18 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 35 +; SI-NEXT: s_or_b32 s43, s43, s21 +; SI-NEXT: s_mov_b32 s23, s21 ; SI-NEXT: s_or_b32 s38, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 38 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s21, v43, 34 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 37 +; SI-NEXT: s_and_b32 s77, s21, 0xff +; SI-NEXT: v_readlane_b32 s21, v43, 33 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_lshl_b32 s78, s21, 24 ; SI-NEXT: s_or_b32 s80, s78, s77 -; SI-NEXT: s_and_b32 s77, s95, 0xff -; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 36 +; SI-NEXT: s_and_b32 s77, s93, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s93, 24 -; SI-NEXT: s_or_b32 s49, s76, s80 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 35 +; SI-NEXT: s_lshl_b32 s78, s95, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff ; SI-NEXT: 
s_or_b32 s48, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 34 +; SI-NEXT: s_or_b32 s49, s76, s80 +; SI-NEXT: s_and_b32 s76, s91, 0xff +; SI-NEXT: s_lshl_b32 s77, s90, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 32 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 33 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 31 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_or_b32 s81, s78, s77 -; SI-NEXT: s_and_b32 s77, s30, 0xff +; SI-NEXT: s_and_b32 s77, s37, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s69, 24 +; SI-NEXT: s_lshl_b32 s78, s68, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 31 ; SI-NEXT: s_or_b32 s50, s78, s77 ; SI-NEXT: s_or_b32 s51, s76, s81 -; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_and_b32 s76, s92, 0xff ; SI-NEXT: s_lshl_b32 s77, s96, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 30 +; SI-NEXT: v_readlane_b32 s16, v43, 30 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff +; SI-NEXT: s_and_b32 s77, s16, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 ; SI-NEXT: s_lshl_b32 s78, s82, 24 -; SI-NEXT: v_writelane_b32 v42, s96, 53 -; SI-NEXT: v_readlane_b32 s18, v43, 32 -; SI-NEXT: v_writelane_b32 v42, s82, 54 +; SI-NEXT: v_writelane_b32 v42, s96, 57 +; SI-NEXT: v_writelane_b32 v42, s82, 58 ; SI-NEXT: s_or_b32 s82, s78, s77 ; SI-NEXT: s_and_b32 s77, s18, 0xff ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 28 +; SI-NEXT: v_readlane_b32 s16, v43, 29 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s70, 24 +; SI-NEXT: s_lshl_b32 s78, s69, 24 ; SI-NEXT: s_or_b32 s53, s76, s82 -; SI-NEXT: s_and_b32 s76, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 27 +; SI-NEXT: s_and_b32 s76, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 28 ; SI-NEXT: s_or_b32 s52, s78, s77 -; SI-NEXT: s_lshl_b32 s77, s17, 8 -; SI-NEXT: v_readlane_b32 s18, v43, 26 +; SI-NEXT: s_lshl_b32 s77, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 27 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s18, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 25 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 26 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 -; SI-NEXT: v_writelane_b32 v42, s16, 55 -; SI-NEXT: s_or_b32 s16, s78, s77 -; SI-NEXT: s_and_b32 s77, s89, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 29 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_writelane_b32 v42, s84, 59 +; SI-NEXT: s_or_b32 s84, s78, s77 +; SI-NEXT: s_and_b32 s77, s70, 0xff ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_lshl_b32 s78, s89, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 22 -; SI-NEXT: v_readlane_b32 s18, v43, 21 +; SI-NEXT: v_readlane_b32 s16, v43, 23 +; SI-NEXT: v_readlane_b32 s18, v43, 22 ; SI-NEXT: s_or_b32 s54, s78, s77 -; SI-NEXT: s_or_b32 s55, s76, s16 -; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_or_b32 s55, s76, s84 +; SI-NEXT: s_and_b32 s76, s16, 0xff ; SI-NEXT: s_lshl_b32 s77, s18, 8 -; SI-NEXT: v_readlane_b32 s17, v43, 20 +; SI-NEXT: v_readlane_b32 s16, v43, 21 ; SI-NEXT: s_or_b32 s76, s76, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 19 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 20 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 
-; SI-NEXT: v_readlane_b32 s17, v43, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 +; SI-NEXT: v_readlane_b32 s16, v43, 25 ; SI-NEXT: s_or_b32 s83, s78, s77 -; SI-NEXT: s_and_b32 s77, s17, 0xff -; SI-NEXT: v_readlane_b32 s17, v43, 23 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 24 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: v_readlane_b32 s17, v43, 17 +; SI-NEXT: v_readlane_b32 s16, v43, 17 ; SI-NEXT: v_readlane_b32 s18, v43, 16 ; SI-NEXT: s_or_b32 s64, s78, s77 ; SI-NEXT: s_or_b32 s65, s76, s83 -; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_and_b32 s76, s16, 0xff ; SI-NEXT: s_lshl_b32 s77, s18, 8 ; SI-NEXT: v_readlane_b32 s18, v43, 15 ; SI-NEXT: s_or_b32 s76, s76, s77 ; SI-NEXT: s_and_b32 s77, s18, 0xff ; SI-NEXT: v_readlane_b32 s18, v43, 14 -; SI-NEXT: v_writelane_b32 v42, s89, 56 ; SI-NEXT: s_lshl_b32 s77, s77, 16 ; SI-NEXT: s_lshl_b32 s78, s18, 24 -; SI-NEXT: v_writelane_b32 v42, s70, 57 +; SI-NEXT: v_readlane_b32 s16, v43, 19 ; SI-NEXT: s_or_b32 s85, s78, s77 -; SI-NEXT: s_and_b32 s77, s44, 0xff -; SI-NEXT: v_readlane_b32 s18, v43, 18 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: s_and_b32 s13, s13, 0xffff -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: s_and_b32 s43, s43, 0xffff -; SI-NEXT: v_writelane_b32 v42, s69, 58 +; SI-NEXT: s_and_b32 s77, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 18 +; SI-NEXT: v_writelane_b32 v42, s89, 60 ; SI-NEXT: s_lshl_b32 s77, s77, 16 -; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_lshl_b32 s78, s16, 24 ; SI-NEXT: s_and_b32 s76, s76, 0xffff -; SI-NEXT: s_and_b32 s44, s29, 0xffff -; SI-NEXT: s_or_b32 s7, s7, s37 -; SI-NEXT: s_or_b32 s9, s9, s68 -; SI-NEXT: s_or_b32 s11, s11, s99 -; SI-NEXT: s_or_b32 s13, s13, s92 -; SI-NEXT: s_or_b32 s15, s15, s90 -; SI-NEXT: s_or_b32 s43, s43, s35 -; SI-NEXT: v_writelane_b32 v42, s30, 59 -; SI-NEXT: s_mov_b32 s23, s91 -; SI-NEXT: s_mov_b32 s91, s36 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_writelane_b32 v42, s70, 61 ; SI-NEXT: s_or_b32 s66, s78, s77 ; SI-NEXT: s_or_b32 s67, s76, s85 -; SI-NEXT: s_and_b32 s45, s45, 0xffff -; SI-NEXT: s_and_b32 s46, s46, 0xffff -; SI-NEXT: s_and_b32 s47, s47, 0xffff -; SI-NEXT: s_and_b32 s56, s56, 0xffff -; SI-NEXT: s_and_b32 s57, s57, 0xffff -; SI-NEXT: s_and_b32 s30, s58, 0xffff -; SI-NEXT: s_and_b32 s34, s59, 0xffff -; SI-NEXT: s_and_b32 s36, s60, 0xffff -; SI-NEXT: s_and_b32 s97, s61, 0xffff -; SI-NEXT: s_and_b32 s86, s62, 0xffff -; SI-NEXT: s_and_b32 s98, s63, 0xffff -; SI-NEXT: s_and_b32 s17, s72, 0xffff -; SI-NEXT: s_and_b32 s87, s73, 0xffff ; SI-NEXT: s_and_b32 s96, s74, 0xffff ; SI-NEXT: s_and_b32 s22, s75, 0xffff ; SI-NEXT: s_or_b32 s74, s44, s4 ; SI-NEXT: s_mov_b32 s75, s5 ; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 -; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 -; SI-NEXT: s_mov_b32 s70, s93 -; SI-NEXT: s_mov_b32 s69, s95 -; SI-NEXT: s_mov_b32 s93, s28 -; SI-NEXT: s_or_b32 s72, s45, s6 -; SI-NEXT: s_mov_b32 s73, s7 -; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[4:5], vcc, 16 +; SI-NEXT: v_writelane_b32 v42, s4, 50 +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_and_b32 s87, s73, 0xffff +; SI-NEXT: v_writelane_b32 v42, s5, 51 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s47, s47, 0xffff +; SI-NEXT: s_and_b32 s86, s62, 0xffff +; SI-NEXT: s_and_b32 s98, s63, 0xffff +; SI-NEXT: s_and_b32 s21, s72, 0xffff ; 
SI-NEXT: s_or_b32 s62, s46, s8 ; SI-NEXT: s_mov_b32 s63, s9 ; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_or_b32 s8, s87, s54 +; SI-NEXT: s_mov_b32 s9, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: v_readlane_b32 s16, v42, 52 +; SI-NEXT: s_or_b32 s13, s13, s99 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_and_b32 s56, s56, 0xffff +; SI-NEXT: s_and_b32 s36, s60, 0xffff +; SI-NEXT: s_and_b32 s97, s61, 0xffff ; SI-NEXT: s_or_b32 s60, s47, s10 ; SI-NEXT: s_mov_b32 s61, s11 -; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[10:11], 16 +; SI-NEXT: s_or_b32 s10, s21, s52 +; SI-NEXT: s_mov_b32 s11, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_lshr_b32 s55, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v42, 53 +; SI-NEXT: s_or_b32 s15, s15, s24 +; SI-NEXT: s_or_b32 s41, s41, s25 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_and_b32 s57, s57, 0xffff +; SI-NEXT: s_and_b32 s30, s58, 0xffff +; SI-NEXT: s_and_b32 s34, s59, 0xffff ; SI-NEXT: s_or_b32 s58, s56, s12 ; SI-NEXT: s_mov_b32 s59, s13 -; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 +; SI-NEXT: s_or_b32 s12, s98, s50 +; SI-NEXT: s_mov_b32 s13, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 +; SI-NEXT: s_lshr_b32 s53, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v42, 54 +; SI-NEXT: s_mov_b32 s70, s69 +; SI-NEXT: s_mov_b32 s69, s68 +; SI-NEXT: s_mov_b32 s68, s37 +; SI-NEXT: s_mov_b32 s37, s95 +; SI-NEXT: s_or_b32 s72, s45, s6 +; SI-NEXT: s_mov_b32 s73, s7 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 ; SI-NEXT: s_or_b32 s56, s57, s14 ; SI-NEXT: s_mov_b32 s57, s15 -; SI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 ; SI-NEXT: s_or_b32 s46, s30, s40 ; SI-NEXT: s_mov_b32 s47, s41 -; SI-NEXT: s_or_b32 s44, s34, s42 -; SI-NEXT: s_mov_b32 s34, s4 -; SI-NEXT: s_mov_b32 s45, s43 -; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 -; SI-NEXT: s_or_b32 s42, s36, vcc_lo -; SI-NEXT: s_mov_b32 s43, vcc_hi -; SI-NEXT: s_lshr_b64 vcc, vcc, 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[40:41], 16 ; SI-NEXT: s_or_b32 s40, s97, s38 ; SI-NEXT: s_mov_b32 s41, s39 ; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 16 ; SI-NEXT: s_or_b32 s14, s86, s48 ; SI-NEXT: s_mov_b32 s15, s49 ; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 -; SI-NEXT: s_or_b32 s12, s98, s50 -; SI-NEXT: s_mov_b32 s13, s51 -; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 -; SI-NEXT: s_or_b32 s10, s17, s52 -; SI-NEXT: s_mov_b32 s11, s53 -; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 -; SI-NEXT: s_or_b32 s8, s87, s54 -; SI-NEXT: s_mov_b32 s9, s55 -; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 ; SI-NEXT: s_or_b32 s6, s96, s64 ; SI-NEXT: s_mov_b32 s7, s65 ; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 ; SI-NEXT: s_or_b32 s4, s22, s66 ; SI-NEXT: s_mov_b32 s5, s67 ; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 -; SI-NEXT: v_readlane_b32 s17, v42, 51 -; SI-NEXT: s_lshr_b32 s55, s17, 16 -; SI-NEXT: s_lshr_b32 s53, s37, 16 -; SI-NEXT: s_lshr_b32 s51, s68, 16 -; SI-NEXT: s_lshr_b32 s49, s99, 16 -; SI-NEXT: s_lshr_b32 s86, s92, 16 -; SI-NEXT: s_lshr_b32 s39, s90, 16 -; SI-NEXT: s_lshr_b32 s18, s31, 16 -; SI-NEXT: s_lshr_b32 s22, s35, 16 -; SI-NEXT: s_lshr_b32 s97, s19, 16 +; SI-NEXT: s_lshr_b32 s51, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v42, 55 +; SI-NEXT: s_or_b32 s44, s34, s42 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_lshr_b64 s[34:35], s[42:43], 16 +; SI-NEXT: s_or_b32 s42, s36, vcc_lo +; SI-NEXT: s_mov_b32 s43, vcc_hi +; SI-NEXT: 
s_lshr_b32 s49, s16, 16 +; SI-NEXT: s_lshr_b32 s86, s99, 16 +; SI-NEXT: s_lshr_b32 s39, s24, 16 +; SI-NEXT: s_lshr_b32 s18, s25, 16 +; SI-NEXT: s_lshr_b32 s22, s23, 16 +; SI-NEXT: s_lshr_b32 s97, s17, 16 ; SI-NEXT: s_lshr_b32 s65, s71, 16 -; SI-NEXT: s_lshr_b32 s19, s80, 16 -; SI-NEXT: s_lshr_b32 s71, s81, 16 +; SI-NEXT: s_lshr_b32 s71, s80, 16 +; SI-NEXT: s_lshr_b32 s21, s81, 16 ; SI-NEXT: s_lshr_b32 s67, s82, 16 -; SI-NEXT: v_readlane_b32 s82, v42, 54 -; SI-NEXT: v_readlane_b32 s96, v42, 53 -; SI-NEXT: s_lshr_b32 s80, s16, 16 -; SI-NEXT: v_readlane_b32 s16, v42, 55 +; SI-NEXT: v_readlane_b32 s82, v42, 58 +; SI-NEXT: v_readlane_b32 s96, v42, 57 +; SI-NEXT: s_lshr_b32 s80, s84, 16 +; SI-NEXT: v_readlane_b32 s84, v42, 59 ; SI-NEXT: s_lshr_b32 s81, s83, 16 -; SI-NEXT: s_mov_b32 s90, s93 -; SI-NEXT: v_readlane_b32 s78, v42, 52 -; SI-NEXT: s_mov_b32 s95, s69 -; SI-NEXT: s_mov_b32 s93, s70 -; SI-NEXT: v_readlane_b32 s30, v42, 59 -; SI-NEXT: v_readlane_b32 s69, v42, 58 -; SI-NEXT: v_readlane_b32 s70, v42, 57 -; SI-NEXT: v_readlane_b32 s89, v42, 56 +; SI-NEXT: s_mov_b32 s95, s37 +; SI-NEXT: s_mov_b32 s37, s68 +; SI-NEXT: s_mov_b32 s68, s69 +; SI-NEXT: s_mov_b32 s69, s70 +; SI-NEXT: v_readlane_b32 s70, v42, 61 +; SI-NEXT: v_readlane_b32 s89, v42, 60 ; SI-NEXT: s_lshr_b32 s77, s85, 16 -; SI-NEXT: s_mov_b32 s84, vcc_lo -; SI-NEXT: s_mov_b32 s36, s91 -; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: v_readlane_b32 s24, v42, 56 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: v_readlane_b32 s4, v43, 42 +; SI-NEXT: v_readlane_b32 s4, v43, 41 ; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s6, v43, 41 +; SI-NEXT: v_readlane_b32 s6, v43, 19 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_lshl_b32 s5, s24, 8 ; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_readlane_b32 s5, v43, 18 @@ -205631,14 +205637,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_readlane_b32 s6, v43, 44 +; SI-NEXT: v_readlane_b32 s6, v43, 43 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_readlane_b32 s8, v43, 24 +; SI-NEXT: v_readlane_b32 s7, v43, 39 +; SI-NEXT: v_readlane_b32 s8, v43, 25 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_lshl_b32 s7, s91, 8 +; SI-NEXT: s_lshl_b32 s7, s7, 8 ; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readlane_b32 s7, v43, 23 +; SI-NEXT: v_readlane_b32 s7, v43, 24 ; SI-NEXT: s_and_b32 s8, s8, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 @@ -205646,15 +205653,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: v_readlane_b32 s7, v43, 23 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_readlane_b32 s8, v43, 21 -; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: v_readlane_b32 s8, v43, 22 +; SI-NEXT: v_readlane_b32 s9, v43, 21 ; SI-NEXT: s_and_b32 s7, s7, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 8 ; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v43, 19 +; SI-NEXT: v_readlane_b32 s8, v43, 20 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s8, s8, 24 ; SI-NEXT: s_lshl_b32 s9, s9, 16 @@ -205662,29 +205669,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; 
SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s7, s7, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 -; SI-NEXT: v_readlane_b32 s8, v43, 43 +; SI-NEXT: v_readlane_b32 s8, v43, 42 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: v_readlane_b32 s9, v43, 38 ; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s9, s78, 8 -; SI-NEXT: s_add_i32 s10, s89, 3 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_add_i32 s10, s70, 3 ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v43, 29 ; SI-NEXT: s_and_b32 s10, s10, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s9, s89, 24 ; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_addk_i32 s8, 0x300 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: s_or_b32 s8, s9, s8 -; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: v_readlane_b32 s9, v43, 29 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: v_readlane_b32 s10, v43, 27 -; SI-NEXT: v_readlane_b32 s11, v43, 26 +; SI-NEXT: v_readlane_b32 s10, v43, 28 +; SI-NEXT: v_readlane_b32 s11, v43, 27 ; SI-NEXT: s_and_b32 s9, s9, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 8 ; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v43, 25 +; SI-NEXT: v_readlane_b32 s10, v43, 26 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s10, s10, 24 ; SI-NEXT: s_lshl_b32 s11, s11, 16 @@ -205692,23 +205699,22 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: s_and_b32 s9, s9, 0xffff ; SI-NEXT: s_or_b32 s9, s10, s9 -; SI-NEXT: v_readlane_b32 s10, v43, 46 +; SI-NEXT: v_readlane_b32 s10, v43, 45 ; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_readlane_b32 s11, v43, 45 -; SI-NEXT: v_readlane_b32 s12, v43, 32 +; SI-NEXT: v_readlane_b32 s11, v43, 44 +; SI-NEXT: v_readlane_b32 s12, v43, 40 ; SI-NEXT: s_and_b32 s10, s10, 0xff ; SI-NEXT: s_lshl_b32 s11, s11, 8 ; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s11, s70, 24 +; SI-NEXT: s_lshl_b32 s11, s69, 24 ; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_addk_i32 s10, 0x300 ; SI-NEXT: s_or_b32 s11, s11, s12 ; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: s_or_b32 s10, s11, s10 -; SI-NEXT: v_readlane_b32 s11, v43, 31 -; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_add_i32 s11, s92, 3 ; SI-NEXT: v_readlane_b32 s13, v43, 30 ; SI-NEXT: s_and_b32 s11, s11, 0xff ; SI-NEXT: s_lshl_b32 s12, s96, 8 @@ -205721,27 +205727,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: s_and_b32 s11, s11, 0xffff ; SI-NEXT: s_or_b32 s11, s12, s11 -; SI-NEXT: s_add_i32 s12, s36, 3 +; SI-NEXT: v_readlane_b32 s12, v43, 37 +; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_and_b32 s12, s12, 0xff -; SI-NEXT: s_lshl_b32 s13, s16, 8 -; SI-NEXT: s_add_i32 s14, s30, 3 +; SI-NEXT: s_lshl_b32 s13, s84, 8 +; SI-NEXT: s_add_i32 s14, s37, 3 ; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: s_and_b32 s14, s14, 0xff -; SI-NEXT: s_lshl_b32 s13, s69, 24 +; SI-NEXT: s_lshl_b32 s13, s68, 24 ; SI-NEXT: s_lshl_b32 s14, s14, 16 ; SI-NEXT: s_addk_i32 s12, 0x300 ; SI-NEXT: s_or_b32 s13, s13, s14 ; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: s_or_b32 s12, s13, s12 -; SI-NEXT: v_readlane_b32 s13, v43, 36 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_readlane_b32 s14, v43, 35 -; SI-NEXT: v_readlane_b32 s15, v43, 34 +; SI-NEXT: s_add_i32 s13, s91, 3 +; SI-NEXT: v_readlane_b32 s15, v43, 32 ; SI-NEXT: s_and_b32 s13, s13, 0xff -; SI-NEXT: 
s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_lshl_b32 s14, s90, 8 ; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v43, 33 +; SI-NEXT: v_readlane_b32 s14, v43, 31 ; SI-NEXT: s_and_b32 s15, s15, 0xff ; SI-NEXT: s_lshl_b32 s14, s14, 24 ; SI-NEXT: s_lshl_b32 s15, s15, 16 @@ -205749,29 +205754,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: s_and_b32 s13, s13, 0xffff ; SI-NEXT: s_or_b32 s13, s14, s13 -; SI-NEXT: v_readlane_b32 s14, v42, 50 +; SI-NEXT: v_readlane_b32 s14, v42, 49 ; SI-NEXT: s_add_i32 s17, s14, 3 -; SI-NEXT: v_readlane_b32 s15, v42, 49 +; SI-NEXT: v_readlane_b32 s15, v42, 48 ; SI-NEXT: s_and_b32 s14, s17, 0xff ; SI-NEXT: s_lshl_b32 s15, s15, 8 -; SI-NEXT: s_add_i32 s16, s95, 3 +; SI-NEXT: s_add_i32 s16, s93, 3 ; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: s_and_b32 s16, s16, 0xff -; SI-NEXT: s_lshl_b32 s15, s93, 24 +; SI-NEXT: s_lshl_b32 s15, s95, 24 ; SI-NEXT: s_lshl_b32 s16, s16, 16 ; SI-NEXT: s_addk_i32 s14, 0x300 ; SI-NEXT: s_or_b32 s15, s15, s16 ; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: s_or_b32 s14, s15, s14 -; SI-NEXT: v_readlane_b32 s15, v43, 40 +; SI-NEXT: v_readlane_b32 s15, v43, 36 ; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: v_readlane_b32 s16, v43, 39 -; SI-NEXT: v_readlane_b32 s17, v43, 38 +; SI-NEXT: v_readlane_b32 s16, v43, 35 +; SI-NEXT: v_readlane_b32 s17, v43, 34 ; SI-NEXT: s_and_b32 s15, s15, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 8 ; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v43, 37 +; SI-NEXT: v_readlane_b32 s16, v43, 33 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s16, s16, 24 ; SI-NEXT: s_lshl_b32 s17, s17, 16 @@ -205779,15 +205784,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: s_and_b32 s15, s15, 0xffff ; SI-NEXT: s_or_b32 s15, s16, s15 -; SI-NEXT: v_readlane_b32 s16, v42, 48 +; SI-NEXT: v_readlane_b32 s16, v42, 47 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 47 -; SI-NEXT: v_readlane_b32 s18, v42, 42 +; SI-NEXT: v_readlane_b32 s17, v42, 46 +; SI-NEXT: v_readlane_b32 s18, v42, 41 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s99, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 43 +; SI-NEXT: v_readlane_b32 s17, v42, 42 ; SI-NEXT: s_and_b32 s18, s99, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 @@ -205795,15 +205800,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 38 +; SI-NEXT: v_readlane_b32 s17, v42, 37 ; SI-NEXT: s_add_i32 s87, s17, 3 -; SI-NEXT: v_readlane_b32 s18, v42, 35 -; SI-NEXT: v_readlane_b32 s19, v42, 29 +; SI-NEXT: v_readlane_b32 s18, v42, 34 +; SI-NEXT: v_readlane_b32 s19, v42, 28 ; SI-NEXT: s_and_b32 s17, s87, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_add_i32 s23, s19, 3 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v42, 30 +; SI-NEXT: v_readlane_b32 s18, v42, 29 ; SI-NEXT: s_and_b32 s23, s23, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 24 ; SI-NEXT: s_lshl_b32 s23, s23, 16 @@ -205812,16 +205817,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s17, s17, 0xffff ; SI-NEXT: 
s_or_b32 s17, s18, s17 ; SI-NEXT: s_add_i32 s40, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 45 +; SI-NEXT: v_readlane_b32 s16, v42, 44 ; SI-NEXT: s_add_i32 s41, s17, 0x3000000 ; SI-NEXT: s_add_i32 s68, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 44 -; SI-NEXT: v_readlane_b32 s18, v42, 39 +; SI-NEXT: v_readlane_b32 s17, v42, 43 +; SI-NEXT: v_readlane_b32 s18, v42, 38 ; SI-NEXT: s_and_b32 s16, s68, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s96, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 40 +; SI-NEXT: v_readlane_b32 s17, v42, 39 ; SI-NEXT: s_and_b32 s18, s96, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 @@ -205829,33 +205834,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 33 +; SI-NEXT: v_readlane_b32 s17, v42, 32 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_readlane_b32 s18, v42, 31 +; SI-NEXT: v_readlane_b32 s18, v42, 30 ; SI-NEXT: s_and_b32 s17, s17, 0xff ; SI-NEXT: s_lshl_b32 s18, s18, 8 ; SI-NEXT: s_or_b32 s17, s18, s17 -; SI-NEXT: v_readlane_b32 s18, v42, 24 +; SI-NEXT: v_readlane_b32 s18, v42, 23 ; SI-NEXT: s_addk_i32 s17, 0x300 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s42, s16, 0x3000000 ; SI-NEXT: s_and_b32 s16, s17, 0xffff -; SI-NEXT: v_readlane_b32 s17, v42, 25 +; SI-NEXT: v_readlane_b32 s17, v42, 24 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 24 ; SI-NEXT: s_lshl_b32 s18, s18, 16 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s43, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 46 +; SI-NEXT: v_readlane_b32 s16, v42, 45 ; SI-NEXT: s_add_i32 s23, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 41 -; SI-NEXT: v_readlane_b32 s18, v42, 37 +; SI-NEXT: v_readlane_b32 s17, v42, 40 +; SI-NEXT: v_readlane_b32 s18, v42, 36 ; SI-NEXT: s_and_b32 s16, s23, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s86, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 36 +; SI-NEXT: v_readlane_b32 s17, v42, 35 ; SI-NEXT: s_and_b32 s18, s86, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205864,15 +205869,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s44, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 28 +; SI-NEXT: v_readlane_b32 s16, v42, 27 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 22 -; SI-NEXT: v_readlane_b32 s18, v42, 16 +; SI-NEXT: v_readlane_b32 s17, v42, 21 +; SI-NEXT: v_readlane_b32 s18, v42, 15 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 15 +; SI-NEXT: v_readlane_b32 s17, v42, 14 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205881,15 +205886,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s45, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 34 +; SI-NEXT: v_readlane_b32 s16, v42, 33 ; SI-NEXT: s_add_i32 s83, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 32 -; SI-NEXT: v_readlane_b32 s18, v42, 26 +; 
SI-NEXT: v_readlane_b32 s17, v42, 31 +; SI-NEXT: v_readlane_b32 s18, v42, 25 ; SI-NEXT: s_and_b32 s16, s83, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 27 +; SI-NEXT: v_readlane_b32 s17, v42, 26 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205898,15 +205903,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s46, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 23 +; SI-NEXT: v_readlane_b32 s16, v42, 22 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 21 -; SI-NEXT: v_readlane_b32 s18, v42, 17 +; SI-NEXT: v_readlane_b32 s17, v42, 20 +; SI-NEXT: v_readlane_b32 s18, v42, 16 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: v_readlane_b32 s17, v42, 17 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205915,15 +205920,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s47, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 20 +; SI-NEXT: v_readlane_b32 s16, v42, 19 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 19 -; SI-NEXT: v_readlane_b32 s18, v42, 13 +; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: v_readlane_b32 s18, v42, 12 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 14 +; SI-NEXT: v_readlane_b32 s17, v42, 13 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205932,15 +205937,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s56, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 12 +; SI-NEXT: v_readlane_b32 s16, v42, 11 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 11 -; SI-NEXT: v_readlane_b32 s18, v42, 7 +; SI-NEXT: v_readlane_b32 s17, v42, 10 +; SI-NEXT: v_readlane_b32 s18, v42, 6 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: v_readlane_b32 s17, v42, 7 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205949,15 +205954,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s57, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 10 +; SI-NEXT: v_readlane_b32 s16, v42, 9 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 9 -; SI-NEXT: v_readlane_b32 s18, v42, 5 +; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: v_readlane_b32 s18, v42, 4 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 6 +; SI-NEXT: v_readlane_b32 s17, v42, 5 ; SI-NEXT: s_and_b32 
s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205966,15 +205971,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s58, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 4 +; SI-NEXT: v_readlane_b32 s16, v42, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 3 -; SI-NEXT: v_readlane_b32 s18, v43, 63 +; SI-NEXT: v_readlane_b32 s17, v42, 2 +; SI-NEXT: v_readlane_b32 s18, v43, 62 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s17, v43, 63 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -205983,15 +205988,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s59, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v42, 2 +; SI-NEXT: v_readlane_b32 s16, v42, 1 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v42, 1 -; SI-NEXT: v_readlane_b32 s18, v43, 61 +; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 60 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 62 +; SI-NEXT: v_readlane_b32 s17, v43, 61 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206000,15 +206005,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s60, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 60 +; SI-NEXT: v_readlane_b32 s16, v43, 59 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 59 -; SI-NEXT: v_readlane_b32 s18, v43, 55 +; SI-NEXT: v_readlane_b32 s17, v43, 58 +; SI-NEXT: v_readlane_b32 s18, v43, 54 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: v_readlane_b32 s17, v43, 55 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206017,15 +206022,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s61, s16, 0x3000000 -; SI-NEXT: v_readlane_b32 s16, v43, 58 +; SI-NEXT: v_readlane_b32 s16, v43, 57 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 57 -; SI-NEXT: v_readlane_b32 s18, v43, 53 +; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: v_readlane_b32 s18, v43, 52 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 54 +; SI-NEXT: v_readlane_b32 s17, v43, 53 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206034,15 +206039,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s17, s17, s18 ; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: s_add_i32 s62, s16, 0x3000000 -; SI-NEXT: 
v_readlane_b32 s16, v43, 52 +; SI-NEXT: v_readlane_b32 s16, v43, 51 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_readlane_b32 s17, v43, 51 -; SI-NEXT: v_readlane_b32 s18, v43, 49 +; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: v_readlane_b32 s18, v43, 48 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: v_readlane_b32 s17, v43, 49 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206071,12 +206076,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readlane_b32 s16, v43, 1 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: v_readlane_b32 s17, v43, 0 -; SI-NEXT: v_readlane_b32 s18, v43, 47 +; SI-NEXT: v_readlane_b32 s18, v43, 46 ; SI-NEXT: s_and_b32 s16, s16, 0xff ; SI-NEXT: s_lshl_b32 s17, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s16, s17, s16 -; SI-NEXT: v_readlane_b32 s17, v43, 48 +; SI-NEXT: v_readlane_b32 s17, v43, 47 ; SI-NEXT: s_and_b32 s18, s18, 0xff ; SI-NEXT: s_addk_i32 s16, 0x300 ; SI-NEXT: s_lshl_b32 s17, s17, 24 @@ -206132,6 +206137,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s15, s15, 0x3000000 ; SI-NEXT: s_add_i32 s75, s16, 0x3000000 ; SI-NEXT: s_lshr_b64 s[76:77], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 ; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 ; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 16 ; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 @@ -206141,12 +206147,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 ; SI-NEXT: s_lshr_b64 s[26:27], s[72:73], 16 ; SI-NEXT: s_lshr_b64 s[28:29], s[62:63], 16 -; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 -; SI-NEXT: s_lshr_b64 s[20:21], s[58:59], 16 -; SI-NEXT: s_lshr_b64 s[24:25], s[56:57], 16 -; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 -; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 -; SI-NEXT: s_lshr_b64 s[84:85], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v42, s16, 50 ; SI-NEXT: s_lshr_b32 s55, s75, 16 ; SI-NEXT: s_lshr_b32 s53, s73, 16 ; SI-NEXT: s_lshr_b32 s51, s63, 16 @@ -206157,12 +206163,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshr_b32 s22, s45, 16 ; SI-NEXT: s_lshr_b32 s97, s43, 16 ; SI-NEXT: s_lshr_b32 s65, s41, 16 -; SI-NEXT: s_lshr_b32 s19, s15, 16 -; SI-NEXT: s_lshr_b32 s71, s13, 16 +; SI-NEXT: s_lshr_b32 s71, s15, 16 +; SI-NEXT: s_lshr_b32 s21, s13, 16 ; SI-NEXT: s_lshr_b32 s67, s11, 16 ; SI-NEXT: s_lshr_b32 s80, s9, 16 ; SI-NEXT: s_lshr_b32 s81, s7, 16 ; SI-NEXT: s_lshr_b32 s77, s5, 16 +; SI-NEXT: v_writelane_b32 v42, s17, 51 ; SI-NEXT: .LBB97_3: ; %end ; SI-NEXT: s_and_b32 s16, s74, 0xffff ; SI-NEXT: s_lshl_b32 s17, s76, 16 @@ -206203,7 +206210,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s60, 0xffff -; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: s_lshl_b32 s17, s78, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 
offen @@ -206217,7 +206224,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s58, 0xffff -; SI-NEXT: s_lshl_b32 s17, s20, 16 +; SI-NEXT: s_lshl_b32 s17, s88, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206231,7 +206238,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s56, 0xffff -; SI-NEXT: s_lshl_b32 s17, s24, 16 +; SI-NEXT: s_lshl_b32 s17, s94, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206245,7 +206252,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s46, 0xffff -; SI-NEXT: s_lshl_b32 s17, s34, 16 +; SI-NEXT: s_lshl_b32 s17, s30, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206259,7 +206266,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s44, 0xffff -; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: s_lshl_b32 s17, s34, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206269,11 +206276,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s17, s22, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s18, v42, 50 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s16 ; SI-NEXT: s_and_b32 s16, s42, 0xffff -; SI-NEXT: s_lshl_b32 s17, s84, 16 +; SI-NEXT: s_lshl_b32 s17, s18, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 ; SI-NEXT: s_or_b32 s16, s16, s17 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206308,7 +206316,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s14 ; SI-NEXT: s_and_b32 s14, s15, 0xffff -; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: s_lshl_b32 s15, s71, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 ; SI-NEXT: s_or_b32 s14, s14, s15 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206322,7 +206330,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, s12 ; SI-NEXT: s_and_b32 s12, s13, 0xffff -; SI-NEXT: s_lshl_b32 s13, s71, 16 +; SI-NEXT: s_lshl_b32 s13, s21, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 ; SI-NEXT: s_or_b32 s12, s12, s13 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen @@ -206386,6 +206394,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s19, v42, 51 ; SI-NEXT: v_readlane_b32 s99, v41, 35 ; SI-NEXT: v_readlane_b32 s98, v41, 34 ; SI-NEXT: v_readlane_b32 s97, v41, 33 @@ -206430,6 +206439,8 @@ define 
inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v42, s4, 50 ; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr55 @@ -206440,32 +206451,32 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: v_writelane_b32 v42, s5, 51 ; SI-NEXT: ; implicit-def: $sgpr97 ; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr65 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr67 @@ -213474,24 +213485,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill @@ -213512,10 +213505,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr35 ; GFX9-NEXT: ; implicit-def: $vgpr34 ; GFX9-NEXT: ; implicit-def: $vgpr63 @@ -213523,7 +213515,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; 
GFX9-NEXT: ; implicit-def: $vgpr39 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; implicit-def: $vgpr49 @@ -213532,13 +213523,33 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; implicit-def: $vgpr62 ; GFX9-NEXT: ; kill: killed $vgpr50 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr42 -; GFX9-NEXT: ; implicit-def: $vgpr41 -; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -213546,9 +213557,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -213572,7 +213586,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(43) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -213631,180 +213645,195 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 
x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: v_lshrrev_b64 
v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded 
Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: 
v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -213814,9 +213843,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29 ; GFX9-NEXT: .LBB98_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v43, v50 +; GFX9-NEXT: v_mov_b32_e32 v50, v40 +; GFX9-NEXT: v_mov_b32_e32 v40, v55 +; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB98_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] @@ -213828,12 +213861,36 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: 
buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] @@ -213865,164 +213922,149 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill 
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v8 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 ; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v30 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -214039,41 +214081,50 @@ 
define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 ; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214082,84 +214133,103 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 ; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -214169,16 +214239,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -214188,23 +214258,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 ; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 @@ -214212,14 +214275,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -214228,11 +214295,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -214241,10 +214308,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214254,11 +214321,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 
4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -214267,10 +214334,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214280,11 +214347,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -214293,10 +214360,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; 
GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214306,11 +214373,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -214319,10 +214386,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -214332,53 +214399,34 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa 
v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59 ; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 ; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -241104,75 +241152,76 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v36, off, 
s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v49 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 
offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v41 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v59 ; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v61 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v6, v62 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v60 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 @@ -241181,18 +241230,17 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:116 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v33 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v50, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v6 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v41, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v10 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 @@ -241213,23 +241261,33 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v63, v49 +; SI-NEXT: v_mov_b32_e32 v49, v54 +; SI-NEXT: v_mov_b32_e32 v54, v41 +; SI-NEXT: v_mov_b32_e32 v41, v9 +; SI-NEXT: v_mov_b32_e32 v56, v5 +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v5, v34 +; SI-NEXT: v_mov_b32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v59, v29 -; SI-NEXT: v_mov_b32_e32 v29, v27 -; SI-NEXT: v_mov_b32_e32 v57, v23 +; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: v_mov_b32_e32 v57, v15 ; SI-NEXT: v_mov_b32_e32 v60, v3 
; SI-NEXT: v_mov_b32_e32 v62, v4 -; SI-NEXT: v_mov_b32_e32 v63, v49 -; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v26, v14 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB108_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 @@ -241241,7 +241299,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -241252,21 +241310,18 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v6, v6, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; SI-NEXT: v_or_b32_e32 v14, v14, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 +; SI-NEXT: v_or_b32_e32 v61, v14, v37 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v36 ; SI-NEXT: v_or_b32_e32 v33, v33, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -241299,240 +241354,236 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_or_b32_e32 v61, v3, v37 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload 
-; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v58 -; SI-NEXT: v_or_b32_e32 v11, v11, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_or_b32_e32 v24, v24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 -; SI-NEXT: v_or_b32_e32 v18, v18, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v3, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; SI-NEXT: v_or_b32_e32 v2, v2, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v52, v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_or_b32_e32 v55, v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_or_b32_e32 v43, v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: 
v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v47 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 ; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; SI-NEXT: v_or_b32_e32 v29, v29, v23 ; SI-NEXT: v_or_b32_e32 v38, v38, v47 ; SI-NEXT: v_or_b32_e32 v54, v54, v42 +; SI-NEXT: v_or_b32_e32 v49, v49, v51 ; SI-NEXT: v_or_b32_e32 v45, v45, v50 -; SI-NEXT: v_or_b32_e32 v41, v41, v30 ; SI-NEXT: v_or_b32_e32 v46, v46, v32 +; SI-NEXT: v_alignbit_b32 v50, v61, v50, 16 +; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v12 +; SI-NEXT: v_mov_b32_e32 v12, v3 +; SI-NEXT: v_mov_b32_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v11, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 +; SI-NEXT: v_or_b32_e32 v16, v16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v22 +; SI-NEXT: v_or_b32_e32 v21, v21, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; SI-NEXT: v_or_b32_e32 v24, v24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 
+; SI-NEXT: v_or_b32_e32 v31, v31, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 +; SI-NEXT: v_or_b32_e32 v19, v19, v37 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_or_b32_e32 v18, v18, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 +; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 ; SI-NEXT: v_alignbit_b32 v47, v16, v47, 16 ; SI-NEXT: v_alignbit_b32 v42, v11, v42, 16 -; SI-NEXT: v_alignbit_b32 v50, v14, v50, 16 -; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 -; SI-NEXT: v_alignbit_b32 v32, v6, v32, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v34 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 +; SI-NEXT: v_or_b32_e32 v2, v2, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v53 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; SI-NEXT: v_or_b32_e32 v52, v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v55 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v40 +; SI-NEXT: v_or_b32_e32 v55, v37, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v43 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v37 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v44 +; SI-NEXT: v_or_b32_e32 v43, v37, v39 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_or_b32_e32 v4, v37, v34 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v43, v34, 16 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v39, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v4, v39, v1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v1, v55, v1, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: v_or_b32_e32 v3, v37, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 
offset:212 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v4 +; SI-NEXT: v_or_b32_e32 v4, v37, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v39, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, v39, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 ; SI-NEXT: v_or_b32_e32 v62, v56, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v60 -; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v3 +; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_or_b32_e32 v60, v56, v39 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 +; SI-NEXT: v_or_b32_e32 v3, v41, v30 +; SI-NEXT: v_alignbit_b32 v5, v52, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v2, v9, 16 ; SI-NEXT: v_or_b32_e32 v57, v56, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v59 +; SI-NEXT: v_alignbit_b32 v41, v19, v39, 16 +; SI-NEXT: v_mov_b32_e32 v39, v3 ; SI-NEXT: v_alignbit_b32 v26, v31, v26, 16 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v29, v29, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_alignbit_b32 v30, v7, v30, 16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: v_or_b32_e32 v59, v56, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 +; SI-NEXT: v_alignbit_b32 v27, v21, v27, 16 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: v_or_b32_e32 v63, v56, v35 +; SI-NEXT: v_alignbit_b32 v56, v18, v37, 16 ; SI-NEXT: v_alignbit_b32 v35, v33, v35, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v3 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v3, v49, v51 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded 
Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v19, v39, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v49, v18, v37, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v21, v27, 16 -; SI-NEXT: v_alignbit_b32 v51, v61, v51, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: .LBB108_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -241546,7 +241597,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v34, v34, v37 ; SI-NEXT: v_add_i32_e32 v37, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_and_b32_e32 v34, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v34, v1 @@ -241559,7 +241610,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v34, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v5 @@ -241572,7 +241623,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -241587,7 +241638,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -241597,11 +241648,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen @@ -241623,11 +241672,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -241637,11 +241684,9 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -241669,23 +241714,23 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -241710,13 +241755,13 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 77df03dcdcd9f..09cc571f9de32 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -48006,33 +48006,32 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v25 @@ -48041,44 +48040,44 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v60, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 +; SI-NEXT: 
v_cvt_f16_f32_e32 v60, v37 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v39 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v51 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 ; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -48088,47 +48087,34 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v54, v9 ; SI-NEXT: v_mov_b32_e32 v55, v11 ; SI-NEXT: v_mov_b32_e32 v41, v13 -; SI-NEXT: v_mov_b32_e32 v48, v5 +; SI-NEXT: v_mov_b32_e32 v48, v4 +; SI-NEXT: v_mov_b32_e32 v4, v3 +; SI-NEXT: v_mov_b32_e32 v3, v43 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 -; SI-NEXT: v_mov_b32_e32 v7, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 @@ -48147,6 +48133,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 @@ -48165,41 +48152,50 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v5, v38, v47 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 +; SI-NEXT: v_or_b32_e32 v9, v38, v47 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_or_b32_e32 v48, v39, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 ; SI-NEXT: v_or_b32_e32 v9, v37, v45 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill @@ -48210,35 +48206,37 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v38, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v14 ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v41, v39, v43 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, 
v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v58 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 -; SI-NEXT: v_or_b32_e32 v55, v37, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v55, v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 ; SI-NEXT: v_or_b32_e32 v54, v25, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 -; SI-NEXT: v_or_b32_e32 v52, v37, v40 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_or_b32_e32 v52, v37, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v62 @@ -48249,22 +48247,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v49 ; SI-NEXT: v_or_b32_e32 v62, v25, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v9 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v61, v29, v28 +; SI-NEXT: v_or_b32_e32 v49, v29, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v60 -; SI-NEXT: v_or_b32_e32 v49, v21, v27 +; SI-NEXT: v_or_b32_e32 v38, v21, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 @@ -48278,6 +48276,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_or_b32_e32 v41, v39, v43 ; SI-NEXT: v_or_b32_e32 v39, v29, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 @@ -48291,14 +48290,19 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v34, v21, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 ; SI-NEXT: 
v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v14 ; SI-NEXT: v_or_b32_e32 v53, v7, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 ; SI-NEXT: v_or_b32_e32 v50, v25, v21 @@ -48306,18 +48310,17 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v35, v13, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 ; SI-NEXT: v_or_b32_e32 v16, v16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v61 ; SI-NEXT: v_or_b32_e32 v15, v15, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 ; SI-NEXT: v_alignbit_b32 v29, v35, v28, 16 +; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16 +; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16 +; SI-NEXT: v_mov_b32_e32 v60, v37 +; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16 ; SI-NEXT: v_or_b32_e32 v22, v21, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 ; SI-NEXT: v_or_b32_e32 v24, v24, v21 @@ -48338,19 +48341,15 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 ; SI-NEXT: v_alignbit_b32 v45, v1, v57, 16 ; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 +; SI-NEXT: v_alignbit_b32 v14, v19, v42, 16 ; SI-NEXT: v_alignbit_b32 v21, v24, v58, 16 +; SI-NEXT: v_mov_b32_e32 v58, v38 ; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 ; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16 ; SI-NEXT: v_alignbit_b32 v30, v16, v59, 16 -; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16 -; SI-NEXT: v_mov_b32_e32 v60, v37 -; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v56 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -48359,7 +48358,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v5 +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v37, v37, v38 ; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen @@ -48383,10 +48382,8 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -48417,7 
+48414,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -48459,7 +48456,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -48476,7 +48473,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 @@ -48488,7 +48485,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index c9e5771240078..0d85c2076c0ed 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -51847,79 +51847,79 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v50, v47 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v58 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v31 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v32 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v59 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v34 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v35 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v35 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:100 @@ -51944,11 +51944,21 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_mov_b32_e32 v8, v60 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v47, v21 ; SI-NEXT: v_mov_b32_e32 v56, v17 ; SI-NEXT: v_mov_b32_e32 v57, v6 -; SI-NEXT: v_mov_b32_e32 v58, v7 -; SI-NEXT: v_mov_b32_e32 v59, v33 +; SI-NEXT: v_mov_b32_e32 v59, v61 +; SI-NEXT: v_mov_b32_e32 v61, v29 +; SI-NEXT: v_mov_b32_e32 v29, v25 +; SI-NEXT: v_mov_b32_e32 v25, v18 +; SI-NEXT: v_mov_b32_e32 v21, v16 +; SI-NEXT: v_mov_b32_e32 v17, v1 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true @@ -51958,12 +51968,14 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 @@ -51973,35 +51985,36 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_or_b32_e32 v3, v3, v34 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_or_b32_e32 v9, v9, v34 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v62 -; SI-NEXT: v_or_b32_e32 v63, v6, v34 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_or_b32_e32 v1, v31, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_or_b32_e32 v31, v12, v34 +; SI-NEXT: v_mov_b32_e32 v12, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; 
SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v16 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 @@ -52015,13 +52028,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v12, v12, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -52031,7 +52042,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_or_b32_e32 v62, v18, v34 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v22, v22, v34 @@ -52043,8 +52054,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v2, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 @@ -52056,79 +52068,89 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_or_b32_e32 v48, v34, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_or_b32_e32 v52, v34, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_add_f32_e32 v34, 
0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v34 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: v_or_b32_e32 v55, v34, v35 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_or_b32_e32 v6, v35, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 @@ -52136,102 +52158,91 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: 
v_lshlrev_b32_e32 v43, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 ; SI-NEXT: v_or_b32_e32 v25, v25, v24 ; SI-NEXT: v_or_b32_e32 v29, v29, v28 ; SI-NEXT: v_or_b32_e32 v54, v54, v51 ; SI-NEXT: v_or_b32_e32 v50, v50, v30 +; SI-NEXT: v_or_b32_e32 v33, v33, v42 ; SI-NEXT: v_or_b32_e32 v39, v39, v41 ; SI-NEXT: v_alignbit_b32 v60, v55, v34, 16 ; SI-NEXT: v_alignbit_b32 v24, v26, v24, 16 ; SI-NEXT: v_alignbit_b32 v28, v22, v28, 16 -; SI-NEXT: v_alignbit_b32 v51, v12, v51, 16 -; SI-NEXT: v_alignbit_b32 v30, v63, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v12, v30, 16 +; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16 ; SI-NEXT: v_alignbit_b32 v41, v3, v41, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v35, v6 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_or_b32_e32 v6, v35, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_alignbit_b32 v1, v52, v1, 16 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_or_b32_e32 v58, v35, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 ; SI-NEXT: v_alignbit_b32 v8, v48, v8, 16 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_or_b32_e32 v57, v46, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v47 -; SI-NEXT: v_alignbit_b32 v14, v37, v14, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v56, v35, v17 +; SI-NEXT: v_alignbit_b32 v17, v2, v17, 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v6 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v57, v46, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v47 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_or_b32_e32 v59, v46, v43 -; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v47, v35, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v61 +; SI-NEXT: v_or_b32_e32 v59, v46, v43 +; SI-NEXT: v_alignbit_b32 v46, v52, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, v37, v14, 16 +; SI-NEXT: v_mov_b32_e32 v14, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 
v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 +; SI-NEXT: v_alignbit_b32 v21, v11, v21, 16 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; SI-NEXT: v_alignbit_b32 v43, v15, v43, 16 ; SI-NEXT: v_or_b32_e32 v61, v44, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v45 +; SI-NEXT: v_alignbit_b32 v51, v14, v51, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v44 ; SI-NEXT: v_or_b32_e32 v36, v36, v45 -; SI-NEXT: v_alignbit_b32 v44, v18, v35, 16 +; SI-NEXT: v_alignbit_b32 v44, v62, v35, 16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_alignbit_b32 v45, v31, v45, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v6, v33, v42 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v2, v17, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v6, v11, v21, 16 -; SI-NEXT: v_alignbit_b32 v42, v9, v42, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 @@ -52243,7 +52254,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v34, v34, v35 ; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v34, v35, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_and_b32_e32 v34, 0xffff, v6 ; SI-NEXT: v_or_b32_e32 v1, v34, v1 @@ -52266,9 +52277,11 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen @@ -52278,11 +52291,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 -; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 ; SI-NEXT: v_or_b32_e32 v1, v1, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen @@ -52292,11 +52303,9 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52337,7 +52346,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 @@ -52350,7 +52359,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52361,7 +52370,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 @@ -52373,8 +52382,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -52391,10 +52400,8 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 4a0bb6ceccd3f..1b5d4a9c1b929 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ 
b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -42,14 +42,11 @@ body: | ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr18_sgpr19 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.3, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr20_sgpr21 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec + ; CHECK-NEXT: renamable $sgpr22_sgpr23 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr34_sgpr35 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr56 = S_MOV_B32 0 - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec - ; CHECK-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr12_sgpr13, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr24_sgpr25 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, implicit $exec ; CHECK-NEXT: renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: renamable $sgpr57 = S_MOV_B32 1083786240 @@ -58,7 +55,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]] @@ -67,7 +64,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.5(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = 
SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr40 = COPY renamable $sgpr72 @@ -95,12 +92,12 @@ body: | ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 ; CHECK-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr56_sgpr57 ; CHECK-NEXT: renamable $sgpr54 = COPY killed renamable $sgpr76 - ; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 - ; CHECK-NEXT: renamable $sgpr48_sgpr49_sgpr50 = COPY renamable $sgpr52_sgpr53_sgpr54 - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr48_sgpr49_sgpr50 - ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47 - ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr68 + ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58 = COPY renamable $sgpr52_sgpr53_sgpr54 + ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 = COPY killed renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK-NEXT: renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: renamable $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55 + ; CHECK-NEXT: renamable $sgpr52_sgpr53_sgpr54 = COPY renamable $sgpr56_sgpr57_sgpr58 + ; CHECK-NEXT: renamable $sgpr55 = COPY killed renamable $sgpr76 ; CHECK-NEXT: renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $sgpr56 = COPY killed renamable $sgpr72 ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) @@ -165,23 +162,22 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: 
successors: %bb.12(0x40000000), %bb.6(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 killed renamable $sgpr12_sgpr13, undef renamable $sgpr54_sgpr55, implicit-def dead $scc + ; CHECK-NEXT: renamable $sgpr12_sgpr13 = S_AND_B64 renamable $sgpr22_sgpr23, undef renamable $sgpr54_sgpr55, implicit-def dead $scc ; CHECK-NEXT: renamable $sgpr54_sgpr55 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr12_sgpr13 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.7(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr34_sgpr35, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: ; CHECK-NEXT: successors: %bb.8(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr64_sgpr65 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $sgpr66_sgpr67 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec @@ -189,14 +185,14 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.8: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.9(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr64_sgpr65, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.10, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.9: ; CHECK-NEXT: successors: %bb.10(0x40000000), %bb.17(0x40000000) - ; 
CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr84_sgpr85, implicit $exec @@ -218,11 +214,17 @@ body: | ; CHECK-NEXT: renamable $sgpr83 = COPY killed renamable $sgpr15 ; CHECK-NEXT: renamable $sgpr85 = COPY killed renamable $sgpr14 ; CHECK-NEXT: renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr18_sgpr19 + ; CHECK-NEXT: renamable $sgpr50_sgpr51 = COPY killed renamable $sgpr20_sgpr21 + ; CHECK-NEXT: renamable $sgpr36_sgpr37 = COPY killed renamable $sgpr22_sgpr23 + ; CHECK-NEXT: renamable $sgpr38_sgpr39 = COPY killed renamable $sgpr24_sgpr25 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $sgpr8_sgpr9 = COPY renamable $sgpr82_sgpr83 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr12_sgpr13, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9 + ; CHECK-NEXT: renamable $sgpr24_sgpr25 = COPY killed renamable $sgpr38_sgpr39 + ; CHECK-NEXT: renamable $sgpr22_sgpr23 = COPY killed renamable $sgpr36_sgpr37 + ; CHECK-NEXT: renamable $sgpr20_sgpr21 = COPY killed renamable $sgpr50_sgpr51 ; CHECK-NEXT: renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr48_sgpr49 ; CHECK-NEXT: renamable $sgpr14 = COPY killed renamable $sgpr85 ; CHECK-NEXT: renamable $sgpr15 = COPY killed renamable $sgpr83 @@ -238,44 +240,42 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.10: ; CHECK-NEXT: successors: %bb.8(0x40000000), %bb.12(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.8, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.12 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.11: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.17(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: 
$sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.17 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.12: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.13(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr54_sgpr55, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr54_sgpr55 ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.13: ; CHECK-NEXT: successors: %bb.15(0x40000000), %bb.14(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.15, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.14 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.14: ; CHECK-NEXT: successors: %bb.15(0x80000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.15: ; CHECK-NEXT: successors: %bb.11(0x40000000), %bb.16(0x40000000) - ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr100_sgpr101 + ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr12_sgpr13 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: $vcc = S_AND_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc + ; CHECK-NEXT: $vcc = S_AND_B64 $exec, renamable $sgpr20_sgpr21, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.11, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.16: diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir index 81f72b70d1ecb..a44f5b477f052 
100644
--- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir
@@ -1,10 +1,8 @@
-# REQUIRES: asserts
-# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -debug-only=regalloc -run-pass=greedy -filetype=null %s 2>&1 | FileCheck %s
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -run-pass=greedy -o - %s | FileCheck %s
 ---
-# Check that physreg candidate is not used since cannot be spilled in a block,
-# e.g. before exec mask preamble
-# CHECK: , cannot spill all interferences.
+# Check that spill save/restore code is inserted after the $exec mask is defined.
 name: foo
 tracksRegLiveness: true
@@ -12,6 +10,88 @@ machineFunctionInfo:
   scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
   stackPtrOffsetReg: $sgpr32
 body: |
+  ; CHECK-LABEL: name: foo
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, $sgpr102_sgpr103
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr102_sgpr103
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103
+  ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.0, align 4, addrspace 5)
+  ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.3, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT: SI_SPILL_S128_SAVE [[COPY1]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY21:%[0-9]+]]:sgpr_128 = COPY [[COPY1]]
+  ; CHECK-NEXT: S_BRANCH %bb.1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr102_sgpr103
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr96_sgpr97, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT: $exec
= S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: liveins: $sgpr98_sgpr99, $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr98_sgpr99, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: liveins: $sgpr102_sgpr103 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_OR_SAVEEXEC_B64_1:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 $sgpr102_sgpr103, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_OR_SAVEEXEC_B64_1]], implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: $exec = IMPLICIT_DEF + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY1]].sub0_sub1, [[SI_SPILL_S128_RESTORE]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE1:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE2:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE1]].sub0_sub1, [[SI_SPILL_S128_RESTORE2]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE3:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_S128_RESTORE4:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: S_CMP_EQ_U64 [[SI_SPILL_S128_RESTORE3]].sub0_sub1, [[SI_SPILL_S128_RESTORE4]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY2]].sub0_sub1, [[COPY3]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY4]].sub0_sub1, [[COPY5]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY6]].sub0_sub1, [[COPY7]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY8]].sub0_sub1, [[COPY9]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY10]].sub0_sub1, [[COPY11]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY12]].sub0_sub1, [[COPY13]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY14]].sub0_sub1, [[COPY15]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY16]].sub0_sub1, [[COPY17]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY18]].sub0_sub1, [[COPY19]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: S_CMP_EQ_U64 [[COPY20]].sub0_sub1, [[COPY21]].sub2_sub3, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit [[S_OR_SAVEEXEC_B64_1]], implicit $vgpr0 bb.0: liveins: $sgpr96_sgpr97, $sgpr98_sgpr99, $sgpr100_sgpr101, 
$sgpr102_sgpr103 @@ -68,6 +148,7 @@ body: | S_BRANCH %bb.4 bb.4: + $exec = IMPLICIT_DEF S_CMP_EQ_U64 %1.sub0_sub1, %2.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %3.sub0_sub1, %4.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %5.sub0_sub1, %6.sub2_sub3, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index b5474b8974b29..1c5f221dd679b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -9742,170 +9742,122 @@ entry: define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: test_limited_sgpr: ; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9 +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 +; GFX6-NEXT: s_mov_b32 s18, 0 +; GFX6-NEXT: v_mov_b32_e32 v6, 0 +; GFX6-NEXT: s_mov_b32 s19, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[16:17], s[14:15] +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:240 ; GFX6-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 ; GFX6-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s42, -1 ; GFX6-NEXT: s_mov_b32 s43, 0xe8f000 ; GFX6-NEXT: s_add_u32 s40, s40, s11 -; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: s_mov_b64 s[4:5], exec -; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_writelane_b32 v1, s0, 0 -; GFX6-NEXT: v_writelane_b32 v1, s1, 1 -; GFX6-NEXT: v_writelane_b32 v1, s2, 2 -; GFX6-NEXT: v_writelane_b32 v1, s3, 3 -; GFX6-NEXT: s_mov_b32 s8, 0x80400 -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s8 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240 -; GFX6-NEXT: s_mov_b32 s2, 0x86a00 -; GFX6-NEXT: s_mov_b64 s[8:9], exec -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224 -; GFX6-NEXT: s_mov_b32 s2, 0x86600 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208 -; GFX6-NEXT: s_mov_b32 s2, 0x86200 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192 -; GFX6-NEXT: s_mov_b32 s2, 0x85e00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176 -; GFX6-NEXT: s_mov_b32 s2, 0x85a00 +; GFX6-NEXT: s_mov_b32 s0, 0x85e00 +; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[16:19], 0 addr64 offset:32 +; GFX6-NEXT: buffer_load_dwordx4 v[16:19], v[5:6], s[16:19], 0 addr64 offset:48 +; GFX6-NEXT: s_waitcnt vmcnt(2) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:224 +; GFX6-NEXT: s_mov_b32 s0, 0x85a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160 -; GFX6-NEXT: s_mov_b32 s2, 0x85600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:208 +; GFX6-NEXT: s_mov_b32 s0, 0x85600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144 -; GFX6-NEXT: s_mov_b32 s2, 0x85200 +; GFX6-NEXT: 
buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:192 +; GFX6-NEXT: s_mov_b32 s0, 0x85200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128 -; GFX6-NEXT: s_mov_b32 s2, 0x84e00 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:176 +; GFX6-NEXT: s_mov_b32 s0, 0x84e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112 -; GFX6-NEXT: s_mov_b32 s2, 0x84a00 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:160 +; GFX6-NEXT: s_mov_b32 s0, 0x84a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96 -; GFX6-NEXT: s_mov_b32 s2, 0x84600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:144 +; GFX6-NEXT: s_mov_b32 s0, 0x84600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: 
buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80 -; GFX6-NEXT: s_mov_b32 s2, 0x84200 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:128 +; GFX6-NEXT: s_mov_b32 s0, 0x84200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x83a00 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:112 +; GFX6-NEXT: s_mov_b32 s0, 0x83e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x83200 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:96 +; GFX6-NEXT: s_mov_b32 s0, 0x83a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:32 -; GFX6-NEXT: s_mov_b32 s2, 0x83600 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 offset:80 +; GFX6-NEXT: s_mov_b32 s0, 0x83600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 
offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[20:23], v[5:6], s[16:19], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s0, 0 -; GFX6-NEXT: v_writelane_b32 v4, s1, 1 -; GFX6-NEXT: v_writelane_b32 v4, s2, 2 -; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s10, 0x80800 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:48 -; GFX6-NEXT: s_mov_b32 s0, 0x83e00 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[16:19], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[16:19], 0 addr64 offset:16 +; GFX6-NEXT: s_mov_b32 s0, 0x83200 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 ; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: v_mov_b32_e32 v7, 1 ; GFX6-NEXT: s_mov_b64 s[0:1], exec @@ -9924,22 +9876,76 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 ; GFX6-NEXT: v_writelane_b32 v4, s10, 6 ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: s_mov_b32 s2, 0x80400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[8:15] +; GFX6-NEXT: ; def s[4:11] ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v4, s4, 0 +; GFX6-NEXT: v_writelane_b32 v4, s5, 1 +; GFX6-NEXT: v_writelane_b32 v4, s6, 2 +; GFX6-NEXT: v_writelane_b32 v4, s7, 3 +; GFX6-NEXT: v_writelane_b32 v4, s8, 4 +; GFX6-NEXT: v_writelane_b32 v4, s9, 5 +; GFX6-NEXT: v_writelane_b32 v4, s10, 6 +; GFX6-NEXT: v_writelane_b32 v4, s11, 7 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: 
s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[16:23] +; GFX6-NEXT: ; def s[4:11] ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v4, s4, 0 +; GFX6-NEXT: v_writelane_b32 v4, s5, 1 +; GFX6-NEXT: v_writelane_b32 v4, s6, 2 +; GFX6-NEXT: v_writelane_b32 v4, s7, 3 +; GFX6-NEXT: v_writelane_b32 v4, s8, 4 +; GFX6-NEXT: v_writelane_b32 v4, s9, 5 +; GFX6-NEXT: v_writelane_b32 v4, s10, 6 +; GFX6-NEXT: v_writelane_b32 v4, s11, 7 +; GFX6-NEXT: s_mov_b32 s2, 0x81400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[24:31] +; GFX6-NEXT: ; def s[0:7] ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v4, s0, 0 +; GFX6-NEXT: v_writelane_b32 v4, s1, 1 +; GFX6-NEXT: v_writelane_b32 v4, s2, 2 +; GFX6-NEXT: v_writelane_b32 v4, s3, 3 +; GFX6-NEXT: v_writelane_b32 v4, s4, 4 +; GFX6-NEXT: v_writelane_b32 v4, s5, 5 +; GFX6-NEXT: v_writelane_b32 v4, s6, 6 +; GFX6-NEXT: v_writelane_b32 v4, s7, 7 +; GFX6-NEXT: s_mov_b32 s10, 0x81c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[8:9] +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[0:3] ; GFX6-NEXT: ;;#ASMEND @@ -9950,33 +9956,28 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX6-NEXT: s_mov_b64 vcc, s[6:7] ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s8, 0 -; GFX6-NEXT: v_writelane_b32 v4, s9, 1 -; GFX6-NEXT: v_writelane_b32 v4, s10, 2 -; GFX6-NEXT: v_writelane_b32 v4, s11, 3 -; GFX6-NEXT: v_writelane_b32 v4, s12, 4 -; GFX6-NEXT: v_writelane_b32 v4, s13, 5 -; GFX6-NEXT: v_writelane_b32 v4, s14, 6 -; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x81400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v4, s12, 0 +; GFX6-NEXT: v_writelane_b32 v4, s13, 1 +; GFX6-NEXT: v_writelane_b32 v4, s14, 2 +; GFX6-NEXT: v_writelane_b32 v4, s15, 3 +; GFX6-NEXT: s_mov_b32 s10, 0x82400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[8:9] +; GFX6-NEXT: s_mov_b64 s[20:21], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x80c00 +; GFX6-NEXT: s_mov_b32 s22, 0x80400 ; GFX6-NEXT: s_waitcnt 
expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s22 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s8, v4, 0 ; GFX6-NEXT: v_readlane_b32 s9, v4, 1 @@ -9988,31 +9989,27 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s15, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: s_mov_b64 exec, s[20:21] +; GFX6-NEXT: s_mov_b64 s[20:21], exec +; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_writelane_b32 v4, s16, 0 ; GFX6-NEXT: v_writelane_b32 v4, s17, 1 ; GFX6-NEXT: v_writelane_b32 v4, s18, 2 ; GFX6-NEXT: v_writelane_b32 v4, s19, 3 -; GFX6-NEXT: v_writelane_b32 v4, s20, 4 -; GFX6-NEXT: v_writelane_b32 v4, s21, 5 -; GFX6-NEXT: v_writelane_b32 v4, s22, 6 -; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x81c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s22, 0x82c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s22 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[20:21] +; GFX6-NEXT: s_mov_b64 s[24:25], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x81400 +; GFX6-NEXT: s_mov_b32 s26, 0x80c00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s26 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s16, v4, 0 ; GFX6-NEXT: v_readlane_b32 s17, v4, 1 @@ -10024,31 +10021,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s23, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s24, 0 -; GFX6-NEXT: v_writelane_b32 v4, s25, 1 -; GFX6-NEXT: v_writelane_b32 v4, s26, 2 -; GFX6-NEXT: v_writelane_b32 v4, s27, 3 -; GFX6-NEXT: v_writelane_b32 v4, s28, 4 -; GFX6-NEXT: v_writelane_b32 v4, s29, 5 -; GFX6-NEXT: v_writelane_b32 v4, s30, 6 -; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s34, 0x82400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[24:25] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s34, 0x81c00 +; GFX6-NEXT: s_mov_b32 s36, 0x81400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, 
s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s24, v4, 0 ; GFX6-NEXT: v_readlane_b32 s25, v4, 1 @@ -10060,8 +10039,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_readlane_b32 s31, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] -; GFX6-NEXT: s_mov_b64 s[6:7], exec +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10069,12 +10048,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: v_writelane_b32 v4, s1, 1 ; GFX6-NEXT: v_writelane_b32 v4, s2, 2 ; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s34, 0x82c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x82800 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[6:7] +; GFX6-NEXT: s_mov_b64 exec, s[34:35] ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 3 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -10087,10 +10066,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 vcc, s[6:7] ; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s36, 0x82400 +; GFX6-NEXT: s_mov_b32 s36, 0x81c00 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10108,7 +10088,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: s_mov_b64 s[34:35], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s44, 0x82c00 +; GFX6-NEXT: s_mov_b32 s44, 0x82800 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10134,84 +10114,102 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: .LBB1_2: ; %ret -; GFX6-NEXT: s_or_b64 exec, exec, vcc -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 s[6:7], vcc +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s6, 0x80400 +; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded 
Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 +; GFX6-NEXT: v_readlane_b32 s16, v4, 0 +; GFX6-NEXT: v_readlane_b32 s17, v4, 1 +; GFX6-NEXT: v_readlane_b32 s18, v4, 2 +; GFX6-NEXT: v_readlane_b32 s19, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1] -; GFX6-NEXT: s_mov_b64 s[4:5], exec +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_mov_b32 s6, 0x80800 +; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v4, 0 -; GFX6-NEXT: v_readlane_b32 s1, v4, 1 -; GFX6-NEXT: v_readlane_b32 s2, v4, 2 -; GFX6-NEXT: v_readlane_b32 s3, v4, 3 +; GFX6-NEXT: v_readlane_b32 s12, v4, 0 +; GFX6-NEXT: v_readlane_b32 s13, v4, 1 +; GFX6-NEXT: v_readlane_b32 s14, v4, 2 +; GFX6-NEXT: v_readlane_b32 s15, v4, 3 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s0, 0x86a00 -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[38:39], s[2:3] +; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b32 s0, 0x86200 +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_mov_b32 s0, 0x86600 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:240 +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s0, 0x86a00 +; GFX6-NEXT: s_waitcnt expcnt(4) +; GFX6-NEXT: v_mov_b32_e32 v0, v20 +; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s0 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s0 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s0 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: v_mov_b32_e32 v1, v21 +; GFX6-NEXT: v_mov_b32_e32 v2, v22 +; GFX6-NEXT: v_mov_b32_e32 v3, v23 +; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt expcnt(2) +; GFX6-NEXT: buffer_load_dword v17, off, 
s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt expcnt(1) +; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s0, 0x86600 +; GFX6-NEXT: v_mov_b32_e32 v23, v3 +; GFX6-NEXT: buffer_load_dword v12, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x86200 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:224 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: v_mov_b32_e32 v22, v2 +; GFX6-NEXT: v_mov_b32_e32 v21, v1 +; GFX6-NEXT: v_mov_b32_e32 v20, v0 +; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v3, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: .LBB1_2: ; %ret +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX6-NEXT: s_mov_b32 s0, 0x85e00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:208 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b64 s[14:15], s[18:19] ; GFX6-NEXT: s_mov_b32 s0, 0x85a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10219,7 +10217,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; 
GFX6-NEXT: s_mov_b32 s0, 0x85600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10227,7 +10225,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x85200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10235,7 +10233,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10243,7 +10241,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10251,7 +10249,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10259,23 +10257,23 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x84200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, 
s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83a00 +; GFX6-NEXT: s_mov_b32 s0, 0x83e00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s0, 0x83e00 +; GFX6-NEXT: s_mov_b32 s0, 0x83a00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10283,7 +10281,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83600 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload @@ -10291,15 +10289,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s0, 0x83200 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:32 -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[20:23], v[5:6], s[12:15], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], v[5:6], s[12:15], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], v[5:6], s[12:15], 0 addr64 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s0 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s0 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s0 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s0 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[36:39], 0 addr64 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[36:39], 0 addr64 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[12:15], 0 addr64 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[12:15], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll index 3913e93b83a66..1dddc29deae25 100644 --- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll +++ 
b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll @@ -31,22 +31,23 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: .cfi_offset %edi, -16 ; CHECK-NEXT: .cfi_offset %ebx, -12 ; CHECK-NEXT: .cfi_offset %ebp, -8 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: testb $1, %bl -; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: je LBB0_7 ; CHECK-NEXT: ## %bb.1: ## %bb116.i -; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: je LBB0_7 ; CHECK-NEXT: ## %bb.2: ## %bb52.i.i -; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: je LBB0_7 ; CHECK-NEXT: ## %bb.3: ## %bb142.i -; CHECK-NEXT: je LBB0_25 +; CHECK-NEXT: je LBB0_7 ; CHECK-NEXT: ## %bb.4: +; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movl L_.str89$non_lazy_ptr, %edi ; CHECK-NEXT: movb $1, %bh ; CHECK-NEXT: movl L_.str$non_lazy_ptr, %ebp ; CHECK-NEXT: jmp LBB0_5 -; CHECK-NEXT: LBB0_21: ## %bb7806 +; CHECK-NEXT: LBB0_23: ## %bb7806 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp16: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -57,50 +58,50 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: LBB0_5: ## %bb3261 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: cmpl $37, 0 -; CHECK-NEXT: jne LBB0_25 -; CHECK-NEXT: ## %bb.6: ## %bb3306 +; CHECK-NEXT: jne LBB0_6 +; CHECK-NEXT: ## %bb.8: ## %bb3306 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp0: ## EH_LABEL ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: calll __ZN12wxStringBaseaSEPKw ; CHECK-NEXT: Ltmp1: ## EH_LABEL -; CHECK-NEXT: ## %bb.7: ## %bb3314 +; CHECK-NEXT: ## %bb.9: ## %bb3314 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: movl 0, %eax ; CHECK-NEXT: cmpl $121, %eax -; CHECK-NEXT: ja LBB0_25 -; CHECK-NEXT: ## %bb.8: ## %bb3314 +; CHECK-NEXT: ja LBB0_6 +; CHECK-NEXT: ## %bb.10: ## %bb3314 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: jmpl *LJTI0_0(,%eax,4) -; CHECK-NEXT: LBB0_10: ## %bb5809 +; CHECK-NEXT: LBB0_12: ## %bb5809 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne LBB0_25 -; CHECK-NEXT: ## %bb.11: ## %bb5809 +; CHECK-NEXT: jne LBB0_6 +; CHECK-NEXT: ## %bb.13: ## %bb5809 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb %bh, %bh -; CHECK-NEXT: je LBB0_25 -; CHECK-NEXT: ## %bb.12: ## %bb91.i8504 +; CHECK-NEXT: je LBB0_6 +; CHECK-NEXT: ## %bb.14: ## %bb91.i8504 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb $1, %bl -; CHECK-NEXT: je LBB0_14 -; CHECK-NEXT: ## %bb.13: ## %bb155.i8541 +; CHECK-NEXT: je LBB0_16 +; CHECK-NEXT: ## %bb.15: ## %bb155.i8541 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp4: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: calll _gmtime_r ; CHECK-NEXT: Ltmp5: ## EH_LABEL -; CHECK-NEXT: LBB0_14: ## %bb182.i8560 +; CHECK-NEXT: LBB0_16: ## %bb182.i8560 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb $1, %bl -; CHECK-NEXT: je LBB0_15 -; CHECK-NEXT: ## %bb.16: ## %bb278.i8617 +; CHECK-NEXT: je LBB0_17 +; CHECK-NEXT: ## %bb.18: ## %bb278.i8617 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: je LBB0_18 -; CHECK-NEXT: ## %bb.17: ## %bb440.i8663 +; CHECK-NEXT: je LBB0_20 +; CHECK-NEXT: ## %bb.19: ## %bb440.i8663 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp6: ## 
EH_LABEL ; CHECK-NEXT: movl L_.str4$non_lazy_ptr, %eax @@ -113,11 +114,11 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: movl $1717, {{[0-9]+}}(%esp) ## imm = 0x6B5 ; CHECK-NEXT: calll __Z10wxOnAssertPKwiPKcS0_S0_ ; CHECK-NEXT: Ltmp7: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_18 -; CHECK-NEXT: LBB0_15: ## %bb187.i8591 +; CHECK-NEXT: jmp LBB0_20 +; CHECK-NEXT: LBB0_17: ## %bb187.i8591 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: jne LBB0_25 -; CHECK-NEXT: LBB0_18: ## %invcont5814 +; CHECK-NEXT: jne LBB0_6 +; CHECK-NEXT: LBB0_20: ## %invcont5814 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp8: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -126,7 +127,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: Ltmp9: ## EH_LABEL -; CHECK-NEXT: ## %bb.19: ## %invcont5831 +; CHECK-NEXT: ## %bb.21: ## %invcont5831 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp10: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -136,7 +137,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: calll __ZN12wxStringBase10ConcatSelfEmPKwm ; CHECK-NEXT: Ltmp11: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_5 -; CHECK-NEXT: LBB0_9: ## %bb5657 +; CHECK-NEXT: LBB0_11: ## %bb5657 ; CHECK-NEXT: Ltmp13: ## EH_LABEL ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -144,8 +145,8 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: movl %eax, (%esp) ; CHECK-NEXT: calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE ; CHECK-NEXT: Ltmp14: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_25 -; CHECK-NEXT: LBB0_20: ## %bb5968 +; CHECK-NEXT: jmp LBB0_6 +; CHECK-NEXT: LBB0_22: ## %bb5968 ; CHECK-NEXT: Ltmp2: ## EH_LABEL ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -153,23 +154,24 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct ; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: Ltmp3: ## EH_LABEL -; CHECK-NEXT: LBB0_25: ## %bb115.critedge.i +; CHECK-NEXT: LBB0_6: ## %bb3267 ; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: LBB0_7: ## %bb115.critedge.i ; CHECK-NEXT: addl $28, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: popl %ebx ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl $4 -; CHECK-NEXT: LBB0_23: ## %lpad.loopexit.split-lp +; CHECK-NEXT: LBB0_25: ## %lpad.loopexit.split-lp ; CHECK-NEXT: Ltmp15: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_25 -; CHECK-NEXT: LBB0_24: ## %lpad8185 +; CHECK-NEXT: jmp LBB0_6 +; CHECK-NEXT: LBB0_26: ## %lpad8185 ; CHECK-NEXT: Ltmp12: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_25 -; CHECK-NEXT: LBB0_22: ## %lpad.loopexit +; CHECK-NEXT: jmp LBB0_6 +; CHECK-NEXT: LBB0_24: ## %lpad.loopexit ; CHECK-NEXT: Ltmp18: ## EH_LABEL -; CHECK-NEXT: jmp LBB0_25 +; CHECK-NEXT: jmp LBB0_6 ; CHECK-NEXT: Lfunc_end0: entry: br i1 %foo, label %bb116.i, label %bb115.critedge.i