diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8937679e460f3..a27d4eeee97f4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7227,27 +7227,52 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) { return DeferredList.contains(MI); } -// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode, -// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add -// subreg access properly. This can be removed after we have sgpr16 in place -void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst, +// Legalize size mismatches between 16bit and 32bit registers in v2s copy +// lowering (change sgpr to vgpr). +// This is mainly caused by 16bit SALU and 16bit VALU using reg with different +// size. Need to legalize the size of the operands during the vgpr lowering +// chain. This can be removed after we have sgpr16 in place +void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx, MachineRegisterInfo &MRI) const { - unsigned Opcode = Inst.getOpcode(); - if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts()) + if (!ST.useRealTrue16Insts()) return; - for (MachineOperand &Op : Inst.explicit_operands()) { - unsigned OpIdx = Op.getOperandNo(); - if (!OpIdx) - continue; - if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) { - unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; - const TargetRegisterClass *RC = RI.getRegClass(RCID); - if (RI.getRegSizeInBits(*RC) == 16) { - Op.setSubReg(AMDGPU::lo16); - } - } - } + unsigned Opcode = MI.getOpcode(); + MachineBasicBlock *MBB = MI.getParent(); + // Legalize operands and check for size mismatch + if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() || + OpIdx >= get(Opcode).getNumOperands()) + return; + + MachineOperand &Op = MI.getOperand(OpIdx); + if (!Op.isReg() || !Op.getReg().isVirtual()) + return; + + const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg()); + if 
(!RI.isVGPRClass(CurrRC)) + return; + + unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; + const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID); + if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) { + Op.setSubReg(AMDGPU::lo16); + } else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) { + const DebugLoc &DL = MI.getDebugLoc(); + Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef); + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg) + .addReg(Op.getReg()) + .addImm(AMDGPU::lo16) + .addReg(Undef) + .addImm(AMDGPU::hi16); + Op.setReg(NewDstReg); + } +} +void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, + MachineRegisterInfo &MRI) const { + for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++) + legalizeOperandsVALUt16(MI, OpIdx, MRI); } void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, @@ -7769,15 +7794,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, return; } - // If this is a v2s copy src from 16bit to 32bit, - // replace vgpr copy to reg_sequence + // If this is a v2s copy between 16bit and 32bit reg, + // replace vgpr copy to reg_sequence/extract_subreg // This can be remove after we have sgpr16 in place if (ST.useRealTrue16Insts() && Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1); - if (16 == RI.getRegSizeInBits(*SrcRegRC) && - 32 == RI.getRegSizeInBits(*NewDstRC)) { + if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) { Register NewDstReg = MRI.createVirtualRegister(NewDstRC); Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass); BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), @@ -7789,7 +7813,13 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist 
&Worklist, .addReg(Undef) .addImm(AMDGPU::hi16); Inst.eraseFromParent(); - + MRI.replaceRegWith(DstReg, NewDstReg); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + return; + } else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC, + AMDGPU::lo16)) { + Inst.getOperand(1).setSubReg(AMDGPU::lo16); + Register NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); return; @@ -7885,23 +7915,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, assert(NewDstRC); NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); - - // Check useMI of NewInstr. If used by a true16 instruction, - // add a lo16 subreg access if size mismatched - // This can be remove after we have sgpr16 in place - if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) { - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); - I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - unsigned UseMIOpcode = UseMI.getOpcode(); - if (AMDGPU::isTrue16Inst(UseMIOpcode) && - (16 == - RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) { - I->setSubReg(AMDGPU::lo16); - } - } - } } fixImplicitOperands(*NewInstr); @@ -8709,6 +8722,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( ++I; } while (I != E && I->getParent() == &UseMI); } else { + legalizeOperandsVALUt16(UseMI, OpNo, MRI); + ++I; } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 64ab064a75f44..01dd3c9f4119e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1304,6 +1304,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Fix operands in Inst to fix 16bit SALU to VALU lowering. 
void legalizeOperandsVALUt16(MachineInstr &Inst, MachineRegisterInfo &MRI) const; + void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx, + MachineRegisterInfo &MRI) const; /// Replace the instructions opcode with the equivalent VALU /// opcode. This function will also move the users of MachineInstruntions diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 44abfd272be88..9126b08857153 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -21659,134 +21659,119 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v115, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 
offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 
offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v162, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v78.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17 +; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 @@ -21799,305 +21784,305 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; 
GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43 ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 
0xff, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, 
v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 
8, v113 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -22184,31 +22169,31 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86 ; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 
8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -22224,61 +22209,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v2, v7, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, 
v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -22289,63 +22274,63 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v13, 8, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v0, 3, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -22356,61 +22341,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v17, 8, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v19, 8, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -22421,61 +22406,61 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165 ; GFX11-TRUE16-NEXT: v_and_b32_e32 
v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v2, 0x300, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -22486,52 +22471,52 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v26, 3, v130 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 -; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -60371,134 +60356,119 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0 ; GFX11-TRUE16-NEXT: s_clause 
0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v84, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v145, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 
offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 
offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v40.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v78, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB39_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 8, v89 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 @@ -60511,305 +60481,305 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, 8, v53 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v1, 8, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 
0xff, v161 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 
8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, 
v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 
v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -60896,31 +60866,31 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 ; 
GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86 ; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -60936,61 +60906,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v4, s4, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 
0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 
16, v2 @@ -61001,63 +60971,63 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v16, 0xff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -61068,61 +61038,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 
0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -61133,61 +61103,61 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v23, 8, v96 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135 
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -61198,52 +61168,52 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 -; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v3, 3, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96 ; GFX11-TRUE16-NEXT: v_and_b32_e32 
v34, 0xff, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -97102,134 +97072,119 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v113, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 
offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 
offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v40, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v160, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19 +; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB59_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 @@ -97242,305 +97197,305 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, 
i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v2, 8, v49 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164 ; GFX11-TRUE16-NEXT: v_or_b32_e32 
v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 
0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -97627,31 +97582,31 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86 ; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -97667,61 +97622,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 -; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -97732,63 +97687,63 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, 
v180 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v15, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -97799,61 +97754,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, 
v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v132 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v19, 8, v128 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -97864,61 +97819,61 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 -; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -97929,52 +97884,52 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 ; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v2, 0x300, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 -; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -133776,134 +133731,119 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v86, v0 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:312 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:308 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:304 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:300 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:264 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:260 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:256 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:248 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 
v81, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:308 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:304 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:300 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:296 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:260 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:252 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 
offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:192 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:108 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v132, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v149, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v181, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v183, off, s32 offset:120 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v40, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v41, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v42, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v43, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v45, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v44, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v46, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v47, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v57, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v56, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v58, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v59, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v61, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v60, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v163, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v164, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v166, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v167, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v176, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v178, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v177, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v179, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v180, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v182, off, s32 offset:4 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v181, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v58.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v63.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v75.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v88.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v90.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v62, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v63, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v73, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v72, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v74, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v75, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v76, 
off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v77, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v78, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v79, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v88, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v90, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v93, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v28 :: v_dual_mov_b32 v34, v27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v25 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v24 :: v_dual_mov_b32 v38, v23 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v22 :: v_dual_mov_b32 v49, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v20 :: v_dual_mov_b32 v51, v19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v18 :: v_dual_mov_b32 v53, v17 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v55, v16 :: v_dual_mov_b32 v54, v15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v64, v14 :: v_dual_mov_b32 v65, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v12 :: v_dual_mov_b32 v66, v11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v68, v10 :: v_dual_mov_b32 v69, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v8 :: v_dual_mov_b32 v70, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v80, v6 :: v_dual_mov_b32 v81, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v83, v4 :: v_dual_mov_b32 v82, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v2 :: v_dual_mov_b32 v85, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB75_4 ; GFX11-TRUE16-NEXT: ; 
%bb.1: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v91 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v89 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v88 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v79 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v77 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v78 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v76 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 @@ -133916,305 +133856,305 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s5, s6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v75 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v73 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v66 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v72 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v65 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s5, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s1, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v74 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v67 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s2, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s3, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s18, 0xff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v53 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s19, 8 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s27, 8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v63 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v61 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v55 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v56 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v49 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v59 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v58 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v51 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, 
v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v47 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v45 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v39 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v40 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v43 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v180 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v91 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v0, v1 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v183 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v182 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v93 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v176 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v79 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v177 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v90 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v88 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v178 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v89 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v164 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v75 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v167 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v166 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v78 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v77 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v76 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v160 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v63 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v163 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v161 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v74 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v72 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v162 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v59 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v151 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v62 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v60 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v150 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v61 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v47 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v147 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v58 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v56 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v146 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v57 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v43 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v46 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v44 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v134 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v45 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v183 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v130 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v41 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v40 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v179 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v119 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v117 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v182 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v180 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v118 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v181 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v167 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v178 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v176 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v177 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v100 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v163 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v101 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v164 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v165 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v151 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v162 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v160 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v161 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v147 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v149 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v148 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v135 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v82 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v145 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v131 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v70 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v133 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v128 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v129 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v114 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v117 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v102 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v113 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v112 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v98 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v103 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v100 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v101 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v87 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v96 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v97 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v93 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -134301,31 +134241,31 @@ define 
inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s3 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v86 ; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v92 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v85 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v91 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v90 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v88 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v84 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v89 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v78 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v71 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v77 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v70 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v76 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 ; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -134341,61 +134281,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v68 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v74 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v73 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v72 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v63 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v65 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v64 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v8, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v61 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v60 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v59 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v52 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v58 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v51 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v57 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v50 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v2, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v56 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v47 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v46 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v45 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v44 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v43 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v42 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v35 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v41 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v40 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -134406,63 +134346,63 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v10 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v3, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v182 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v93 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v181 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v92 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v180 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v179 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v91 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v90 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v89 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v13, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v177 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v88 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v176 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v167 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v79 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v78 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 
0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v166 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v77 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v165 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 3, v76 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v164 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v163 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v75 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v74 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v162 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v73 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v161 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v72 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v160 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v151 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v150 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v63 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v62 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v61 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v3 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v0, v15, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v149 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v60 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v148 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v59 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -134473,61 +134413,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v15 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v58 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v3, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v57 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v145 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v56 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v135 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v47 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v46 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v45 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v17, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v18, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v44 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, 
v132 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v43 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v42 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v17, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v18, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v41 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v129 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v40 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v128 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v183 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 3, v182 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v181 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v19, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v180 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v116 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v115 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v179 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v178 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 3, v177 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v19, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v20, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v112 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v167 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -134538,61 +134478,61 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v21, 3, v166 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v3, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v102 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v165 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v101 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v100 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v99 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v163 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v162 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v161 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v22, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v160 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v96 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v87 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v151 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v150 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v22, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v23, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v149 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v23, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v148 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v84 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v147 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 3, v146 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v0, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v80 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v71 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v135 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v134 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v133 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v131 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -134603,52 +134543,52 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v1, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v67 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 3, v130 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v3, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 8, v128 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v129 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 8, v113 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v1, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v117 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 8, v114 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v28, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v116 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v30, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v101 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v28, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v29, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 0x300, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v49 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v28, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v34, 3, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xff, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v102 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xff, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xff, v36 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v33, 8, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 8, v96 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v87 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v33, v29 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v32, v34 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v36, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v37, v34 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 @@ -160847,8 +160787,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 @@ -160898,72 +160839,56 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, 
s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: v_dual_mov_b32 v82, v22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, 
v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB89_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -161021,20 +160946,20 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 
v3, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -161042,48 +160967,48 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v17, 8, v133 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 @@ -161428,84 +161353,84 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v4, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v7, 8, v101 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 @@ -161611,16 +161536,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v35, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 @@ -161639,7 +161564,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 @@ -188892,8 +188817,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 
offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 @@ -188943,72 +188869,56 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v134, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v98.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: 
v_dual_mov_b32 v82, v22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB93_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -189066,20 +188976,20 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v11, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -189087,48 +188997,48 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, 8, v144 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 @@ -189473,84 +189383,84 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, 
v84 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 @@ -189656,16 +189566,16 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 @@ -189684,7 +189594,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 @@ -212549,8 +212459,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:328 ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v92, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:320 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v50, v6 :: v_dual_mov_b32 v33, v0 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v89, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v92, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v91, off, s32 offset:304 @@ -212600,72 +212511,56 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v151, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v162, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v147, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v148, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v150, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v144, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v132, off, 
s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v135, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v145, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v129, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v133, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v134, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v117, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v119, off, s32 offset:64 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v146, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v128, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v131, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v130, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v118, off, s32 offset:44 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v113, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v116, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, 
off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:12 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v86, v30 :: v_dual_mov_b32 v81, v29 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v84, v28 :: v_dual_mov_b32 v83, v27 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v85, v26 :: v_dual_mov_b32 v80, v24 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v69, v25 :: v_dual_mov_b32 v82, v22 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v71, v23 :: v_dual_mov_b32 v64, v21 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v67, v20 :: v_dual_mov_b32 v68, v19 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v70, v18 :: v_dual_mov_b32 v55, v16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v52, v17 :: v_dual_mov_b32 v65, v15 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v66, v14 :: v_dual_mov_b32 v49, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v54, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v11 :: v_dual_mov_b32 v36, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v37, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v7 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v5 :: v_dual_mov_b32 v39, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v32, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB97_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -212723,20 +212618,20 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v85 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v83 ; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v9, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v98 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v86 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v98 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v84 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v85 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v103 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v87 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v101 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -212744,48 +212639,48 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v13, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v96 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v97 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v112 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v112 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v103 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v100 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v131 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v118 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v128 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v115 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v146 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v133 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v118 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v115 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v116 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v113 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v130 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v119 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v130 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v129 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v133 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v102 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v128 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v117 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v145 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v144 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v146 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v145 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v3, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v132 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v132 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v117 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v114 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v135 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xff, v131 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v129 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v15, 16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v144 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v18, 16, v19 @@ -213130,84 +213025,84 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v147 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v146 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v134 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v144 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v144 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v132 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v145 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v135 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v134 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v119 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 
8, v129 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v117 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v119, 0x300, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v133 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v131 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v131, 0x300, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v132, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v114 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v129 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v146 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v114, 0x300, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v135 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v130 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v129, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v128, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v133 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v119 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v102 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v117 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v131 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v102, 0x300, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v128 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v118 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v117, 0x300, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v116 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v118 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v115 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v115 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v113 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v116 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v114 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v113 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v112 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v112 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v103 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v102 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v101 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v99 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v100 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v98 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v86 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v96 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v55, 3, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v96 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v97 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v97 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v98 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, 
v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v85 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v86 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v87 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v85 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v84 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v5 @@ -213313,16 +213208,16 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v70, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v119 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v132 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v17 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v20, v20, 16, v34 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v16, 16, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v129 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v114, 16, v33 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v128 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v18, v129, 16, v33 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v21, 16, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v176 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v102, 16, v32 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v17, v117, 16, v32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v164 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 @@ -213341,7 +213236,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-TRUE16-NEXT: 
v_lshl_or_b32 v19, v117, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v131, 16, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v24, v167, 16, v24 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v27, v45, 16, v32 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v28, v57, 16, v33 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 178718a338432..8dc00701dcfd6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -7393,15 +7393,10 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4 @@ -7441,27 +7436,27 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 @@ -7470,14 +7465,14 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff 
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -7535,22 +7530,22 @@ define inreg <8 x i32> @bitcast_v32i8_to_v8i32_scalar(<32 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 @@ -14771,15 +14766,10 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; 
GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 @@ -14819,27 +14809,27 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, 
v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 @@ -14848,14 +14838,14 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -14913,22 +14903,22 @@ define inreg <8 x float> @bitcast_v32i8_to_v8f32_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 @@ -21656,15 +21646,10 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: 
v_dual_mov_b32 v15, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4 @@ -21704,27 +21689,27 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 @@ -21733,14 +21718,14 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -21798,22 +21783,22 @@ define inreg <4 x i64> @bitcast_v32i8_to_v4i64_scalar(<32 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 
3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 @@ -28039,15 +28024,10 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v15, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v6 :: v_dual_mov_b32 v17, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v3 :: v_dual_mov_b32 v19, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v2 :: v_dual_mov_b32 v21, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4 @@ -28087,27 +28067,27 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v21 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v15 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xff, v12 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 8, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v14, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v25 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 @@ -28116,14 +28096,14 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v23, 16, v6 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v3, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v22, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v23 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -28181,22 +28161,22 @@ define inreg <4 x double> @bitcast_v32i8_to_v4f64_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19 ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v16 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v17 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v22 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v9 @@ -34105,20 +34085,10 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v15, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4 @@ -34149,25 +34119,25 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 @@ -34231,42 +34201,42 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 
v4, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 @@ -34280,7 +34250,7 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -39305,20 +39275,10 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v20, v4 :: v_dual_mov_b32 v15, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4 @@ -39349,25 +39309,25 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v6, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 @@ -39431,42 +39391,42 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 @@ -39480,7 +39440,7 @@ define inreg <16 x half> @bitcast_v32i8_to_v16f16_scalar(<32 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 @@ -43653,20 +43613,10 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 
0, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v21, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v6 :: v_dual_mov_b32 v19, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v15, v1 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v2 :: v_dual_mov_b32 v17, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4 @@ -43697,25 +43647,25 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v16 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v15 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xff, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 8, v13 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 @@ -43779,42 +43729,42 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v10 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v11 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v19 -; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v22 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v15 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: 
s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 @@ -43828,7 +43778,7 @@ define inreg <16 x bfloat> @bitcast_v32i8_to_v16bf16_scalar(<32 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v6 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v3, 16, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v8, 16, v7 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v2, 16, v0 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index d966d136d75b6..73c730f3c30dd 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -6469,17 +6469,11 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v23, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v5 :: v_dual_mov_b32 v27, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v1 :: 
v_dual_mov_b32 v31, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB15_4 @@ -6509,7 +6503,7 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff @@ -6527,29 +6521,29 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v22 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v14 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19 @@ -6557,9 +6551,9 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v22, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v36, v37 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -6568,12 +6562,12 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -6631,36 +6625,36 @@ define inreg <10 x i32> @bitcast_v40i8_to_v10i32_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v28 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, 
v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 @@ -13954,17 +13948,11 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v9 :: v_dual_mov_b32 v23, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v25, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v5 :: v_dual_mov_b32 v27, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_mov_b32 v31, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB35_4 @@ -13994,7 +13982,7 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff @@ -14012,29 +14000,29 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v27 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 8, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v22 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, 
v14 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xff, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xff, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 8, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xff, v18 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v35, 8, v19 @@ -14042,9 +14030,9 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v32, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v22, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v34, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v36, v37 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v36, v37 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -14053,12 +14041,12 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v34, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 ; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 @@ -14116,36 +14104,36 @@ define inreg <10 x float> @bitcast_v40i8_to_v10f32_scalar(<40 x i8> inreg %a, i3 ; 
GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v30 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v28 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v27 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v7, 8, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 @@ -21021,28 +21009,14 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v15 :: v_dual_mov_b32 v25, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v27, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v36, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v24, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v7 :: v_dual_mov_b32 v26, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v3 :: 
v_dual_mov_b32 v38, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v37, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 @@ -21073,44 +21047,44 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v23 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5 @@ -21170,61 +21144,61 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v31 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, 
v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v32 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v35 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v37 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 @@ -27569,28 +27543,14 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v15 :: v_dual_mov_b32 v25, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v27, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v29, v11 :: v_dual_mov_b32 v36, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v10 :: v_dual_mov_b32 v24, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v23, v7 :: v_dual_mov_b32 v26, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v5 :: v_dual_mov_b32 v35, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v3 :: v_dual_mov_b32 v38, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_mov_b32 v37, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB63_4 @@ -27621,44 +27581,44 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: 
s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v38 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e64 v1, 0xffff, s10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v23 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v0, 16, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v32 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v31 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v19 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v29 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v5 @@ -27718,61 +27678,61 @@ define inreg <20 x half> @bitcast_v40i8_to_v20f16_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v20 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v18 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v35 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 
3, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v31 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v21 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v29 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v32 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v26 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v25 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v35 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v37 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 0x300, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 @@ -31989,23 +31949,14 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v15 :: v_dual_mov_b32 v23, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v14 :: v_dual_mov_b32 v25, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v11 :: v_dual_mov_b32 v27, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v29, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v8 :: v_dual_mov_b32 v31, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v4 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB73_4 @@ -32035,7 +31986,7 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff @@ -32053,29 +32004,29 @@ define inreg <5 x double> @bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, 
v33 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19 @@ -32157,49 +32108,49 @@ define inreg <5 x double> 
@bitcast_v40i8_to_v5f64_scalar(<40 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v34 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v32 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 @@ -36633,23 +36584,14 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v6.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v0.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v38, v15 :: v_dual_mov_b32 v23, v13 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v24, v14 :: v_dual_mov_b32 v25, v12 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v26, v11 :: v_dual_mov_b32 v27, v9 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v10 :: v_dual_mov_b32 v29, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v8 :: v_dual_mov_b32 v31, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v5 :: v_dual_mov_b32 v33, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v34, v4 :: v_dual_mov_b32 v35, v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_mov_b32 v37, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB77_4 @@ -36679,7 +36621,7 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_or_b32 s7, s7, s8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36 ; GFX11-TRUE16-NEXT: s_and_b32 s7, s7, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s8, s8, 16 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s24, 0xff @@ -36697,29 +36639,29 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s10, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v33 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, 
v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v32 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v18 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v19 @@ -36801,49 +36743,49 @@ define inreg <5 x i64> @bitcast_v40i8_to_v5i64_scalar(<40 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: s_and_b32 s3, s3, 0xffff ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 16 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v36 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s3, s4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v34 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v33 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v32 ; GFX11-TRUE16-NEXT: s_and_b32 s5, s28, 0xff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s29, 8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v33 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s6, s5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v16 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v24 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v9 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v17 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 397955a8a8928..ca27410a1c127 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -15124,42 +15124,34 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16i32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; 
GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -15196,37 +15188,37 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff ; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xff, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 @@ -15267,28 +15259,28 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, 
v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 @@ -15355,7 +15347,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54 ; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 @@ -15367,14 +15359,14 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 @@ -15387,31 +15379,31 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 @@ -15461,42 +15453,42 @@ 
define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 @@ -30479,42 +30471,34 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v85, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -30551,37 +30535,37 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff ; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; 
GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 @@ -30622,28 +30606,28 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 
0xff, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 @@ -30710,7 +30694,7 @@ define inreg <16 x float> 
@bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54 ; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 @@ -30722,14 +30706,14 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 @@ -30742,31 +30726,31 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 @@ -30816,42 +30800,42 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v2, v11, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v19, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 @@ -45105,42 +45089,34 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v55.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7 
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -45177,37 +45153,37 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff ; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 @@ -45248,28 +45224,28 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v13, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 @@ -45336,7 +45312,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54 ; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 @@ 
-45348,14 +45324,14 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 @@ -45368,31 +45344,31 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v8, 3, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 @@ -45442,42 +45418,42 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; 
GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 
v12, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 @@ -58885,42 +58861,34 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_mov_b32 v54, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v6.l 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v15 :: v_dual_mov_b32 v32, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v13 :: v_dual_mov_b32 v34, v11 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v12 :: v_dual_mov_b32 v36, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v9 :: v_dual_mov_b32 v38, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v39, v8 :: v_dual_mov_b32 v48, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v5 :: v_dual_mov_b32 v50, v3 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v4 :: v_dual_mov_b32 v52, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v84 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v1.l +; GFX11-TRUE16-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -58957,37 +58925,37 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_and_b32 s8, s9, 0xffff ; GFX11-TRUE16-NEXT: s_and_b32 s9, s26, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s10, s27, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v54 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, s10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s9, 16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v49 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v83 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_and_b32 s11, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s12, s29, 8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: s_or_b32 s10, s11, s12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v37 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s10, 0xffff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v34 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s10, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v0, 0xff, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v69 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v70 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v64 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xff, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v7 @@ -59028,28 +58996,28 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v29 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v13, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v82 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v86 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v38 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v68 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v48 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v37 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v98, 0xff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 8, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v14, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v86, v87 @@ -59116,7 +59084,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_lshl_b32 s7, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s4, s5, s4 ; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v54 ; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s3, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 @@ -59128,14 +59096,14 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s2, s2, s3 ; GFX11-TRUE16-NEXT: s_or_b32 s3, s4, s5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v84 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v83 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v82 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v4, 8, v80 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v49 ; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s28, 0xff ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 @@ -59148,31 +59116,31 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s4, 0xffff -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, s4, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v68 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v36 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v1, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v35 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, 
v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v64 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v55 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 @@ -59222,42 +59190,42 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v28 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v29 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v1, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v3 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 3, v83 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v50 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v84 ; GFX11-TRUE16-NEXT: v_and_b32_e32 
v3, 0xff, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v37 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 3, v69 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v67 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v12, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v48 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v38 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 3, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 8, v68 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v71 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xff, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 8, v66 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v64 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v55 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v15 @@ -72878,57 +72846,34 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0 
; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, 
v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: 
v_dual_mov_b32 v38, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -72952,8 +72897,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 ; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8 @@ -72961,14 +72906,14 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff ; GFX11-TRUE16-NEXT: 
s_lshl_b32 s11, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 @@ -72977,10 +72922,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 @@ -72988,63 +72933,63 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v10, 8, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 @@ -73103,151 +73048,151 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v25, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 
0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 @@ -85910,57 +85855,34 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: v_dual_mov_b32 v38, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -85984,8 +85906,8 @@ define inreg <32 x half> 
@bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 ; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, s25, 8 @@ -85993,14 +85915,14 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 @@ -86009,10 +85931,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, 
v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 @@ -86020,63 +85942,63 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, 
v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 
0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 @@ -86135,151 +86057,151 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 
8, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v4, v16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 16, v15 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 @@ -97280,57 +97202,34 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32bf16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_mov_b32 v52, v0 ; GFX11-TRUE16-NEXT: s_clause 0xf -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:56 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v84, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v10.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v33, v15 :: v_dual_mov_b32 v34, v14 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v37, v13 :: v_dual_mov_b32 v54, v10 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v51, v12 :: v_dual_mov_b32 v50, v7 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v49, v11 :: v_dual_mov_b32 v48, v8 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v35, v9 :: v_dual_mov_b32 v36, v4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v53, v6 :: v_dual_mov_b32 v38, v5 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v32, v3 :: v_dual_mov_b32 v39, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v86 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s5, vcc_lo, exec_lo ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false @@ -97354,8 +97253,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v32 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s7, s8, s9 ; GFX11-TRUE16-NEXT: s_and_b32 s8, s24, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s9, 
s25, 8 @@ -97363,14 +97262,14 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s27, 8 ; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, s9 ; GFX11-TRUE16-NEXT: s_or_b32 s9, s10, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v31 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v52 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 8, v38 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v53 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v50 ; GFX11-TRUE16-NEXT: s_and_b32 s10, s28, 0xff ; GFX11-TRUE16-NEXT: s_lshl_b32 s11, s29, 8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 @@ -97379,10 +97278,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_and_b32_e64 v3, 0xffff, s9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v48 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v35 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v54 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v49 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 @@ -97390,63 +97289,63 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v34 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v17 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v0, 16, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v8 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v19 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 8, v23 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v2, 16, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v21 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v48 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v37 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 8, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xff, v26 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 8, v27 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v10 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v49 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v81 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v29 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v30 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v69 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v66 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v3, 16, v2 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v1, 16, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v13 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v64 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 8, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v85 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v84 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v82 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xff, v71 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 8, v69 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v80 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v83 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v71 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 8, v83 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v86, v87 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v50 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v80 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v68 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xff, v70 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 8, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v96, 0xff, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 8, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v11 @@ -97505,151 +97404,151 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX11-TRUE16-NEXT: s_addk_i32 s1, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s10, 0x300 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v80 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 3, v84 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s9, s10 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v83 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 3, v85 ; GFX11-TRUE16-NEXT: s_addk_i32 s5, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s6, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s7, 0x300 ; GFX11-TRUE16-NEXT: s_addk_i32 s8, 0x300 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v70 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v82 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s7, s8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v68 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v81 ; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s5, s6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v71 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v83 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v80 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v85 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 3, v71 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v54 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 3, v70 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v50 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 8, v67 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v84 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v69 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v82 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 3, v68 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v81 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 8, v65 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v69 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v66 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v66 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 3, v22 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 3, v18 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v64 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v26 ; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v13, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v28 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v52 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v49 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v29 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v9 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v37 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 8, v25 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, 0x300, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v37, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v18, 0x300, v5 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v10 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v36 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v35 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v20 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, 0x300, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v34 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0x300, v4 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 8, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v8, 8, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v31, 3, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v32, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v54 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 8, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v27, 3, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v29, 0x300, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 8, v49 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 3, v51 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v20, 0x300, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v8, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x300, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v28, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v26, 0x300, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v37 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0x300, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 3, v53 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 3, v48 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, 3, v39 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff, v6 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v50 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 8, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 3, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v20, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v19, 0x300, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0x300, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 8, v32 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v24, 3, v36 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v25, 0x300, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v22, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v23, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 8, v38 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v22, 0x300, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 3, v52 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v23, 0x300, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x300, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v18, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 8, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX11-TRUE16-NEXT: s_addk_i32 s4, 0x300 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v0, 0x300, v0 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u32_e32 v6, 0x300, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e64 v8, 0xffff, s4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0x300, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0x300, v1 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, 0x300, v2 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0x300, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v5, 16, v8 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v23, 16, v8 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v25, 16, v16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v5, v6, 16, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v7, v25, 16, v8 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v8, v16, 16, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v29, 16, v18 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v19 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v37, 16, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v6, v23, 16, v6 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v20, 16, v19 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v10, 16, v21 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v18, 
16, v15 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 16, v12 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v13, v13, 16, v3 ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v14, v2, 16, v16 diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index b27ad26cf97b9..4cb5b7c43a46d 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -77,18 +77,19 @@ define amdgpu_kernel void @br_cc_f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.h, v1.h ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB0_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %one -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-TRUE16-NEXT: s_endpgm +; GFX11-TRUE16-NEXT: s_branch .LBB0_3 ; GFX11-TRUE16-NEXT: .LBB0_2: ; %two -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: .LBB0_3: ; %one +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: br_cc_f16: @@ -192,13 +193,15 @@ define amdgpu_kernel void @br_cc_f16_imm_a( ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_cmp_nlt_f16_e32 vcc_lo, 0.5, v1.l +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0.h ; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %one ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800 +; GFX11-TRUE16-NEXT: s_branch .LBB1_3 ; GFX11-TRUE16-NEXT: .LBB1_2: ; %two +; GFX11-TRUE16-NEXT: .LBB1_3: ; %one ; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -298,13 +301,15 @@ define amdgpu_kernel void @br_cc_f16_imm_b( ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3 ; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v1.l -; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_2 -; GFX11-TRUE16-NEXT: ; %bb.1: ; %two +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0.h +; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB2_2 +; GFX11-TRUE16-NEXT: ; %bb.1: ; %one +; GFX11-TRUE16-NEXT: s_branch .LBB2_3 +; GFX11-TRUE16-NEXT: .LBB2_2: ; %two ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800 -; GFX11-TRUE16-NEXT: .LBB2_2: ; %one +; GFX11-TRUE16-NEXT: .LBB2_3: ; %one ; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index f9db082a2e912..9b6a2f3a1aa1e 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -57,6 +57,57 @@ body: | %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec ... 
+--- +name: salu16_usedby_salu32 +body: | + bb.0: + ; GCN-LABEL: name: salu16_usedby_salu32 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16 + ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[REG_SEQUENCE]], [[DEF]], implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:sreg_32 = COPY %0:vgpr_32 + %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode + %3:sreg_32 = S_XOR_B32 %2:sreg_32, %1:sreg_32, implicit-def $scc +... + +--- +name: salu32_usedby_salu16 +body: | + bb.0: + ; GCN-LABEL: name: salu32_usedby_salu16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[DEF]], [[DEF]], implicit $exec + ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[V_XOR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:sreg_32 = COPY %0:vgpr_32 + %2:sreg_32 = S_XOR_B32 %1:sreg_32, %1:sreg_32, implicit-def $scc + %3:sreg_32 = S_TRUNC_F16 %2:sreg_32, implicit $mode +... 
+ +--- +name: S_FMAC_F16 +body: | + bb.0: + ; GCN-LABEL: name: S_FMAC_F16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_lo16 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16 + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF3]], %subreg.hi16 + ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec + %0:vgpr_16 = IMPLICIT_DEF + %1:sgpr_lo16 = COPY %0:vgpr_16 + %2:sreg_32 = COPY %0:vgpr_16 + %3:sreg_32 = COPY %1:sgpr_lo16 + %4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode +... + --- name: vgpr16_to_spgr32 body: | diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 125d009429cbf..7a1351174733b 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -6,7 +6,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f16: @@ -255,42 +256,81 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: frem_f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3 -; 
GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: frem_f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: frem_f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; 
GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -456,26 +496,47 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: fast_frem_f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3 -; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 -; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: fast_frem_f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], 
s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: fast_frem_f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr 
half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -641,26 +702,47 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: unsafe_frem_f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3 -; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 -; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 -; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: unsafe_frem_f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h +; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: unsafe_frem_f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2 +; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 @@ -2308,68 +2390,130 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: frem_v2f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v0, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 -; 
GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 -; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5 -; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5 -; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5 -; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1 -; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3 -; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: frem_v2f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] 
op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v3.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1150-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l +; GFX1150-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: frem_v2f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; 
GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4 +; 
GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3 +; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8 @@ -3034,115 +3178,226 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX1150-LABEL: frem_v4f16: -; GFX1150: ; %bb.0: -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-NEXT: v_mov_b32_e32 v4, 0 -; GFX1150-NEXT: s_waitcnt lgkmcnt(0) -; GFX1150-NEXT: s_clause 0x1 -; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3] -; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 -; GFX1150-NEXT: s_waitcnt vmcnt(1) -; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX1150-NEXT: s_waitcnt vmcnt(0) -; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v8, v8 -; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] 
op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8 -; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 -; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX1150-NEXT: v_rcp_f32_e32 v7, v7 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7 -; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7 -; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7 -; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6 -; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0 -; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 -; GFX1150-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 -; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0 -; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX1150-NEXT: v_rcp_f32_e32 v7, v7 -; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7 -; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7 -; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7 -; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5 -; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2 -; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX1150-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] -; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 -; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 -; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3 -; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1] -; GFX1150-NEXT: s_endpgm +; GFX1150-TRUE16-LABEL: frem_v4f16: +; GFX1150-TRUE16: ; %bb.0: +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, 0 +; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-TRUE16-NEXT: s_clause 0x1 +; GFX1150-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h +; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 
v8, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0 +; 
GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, 
v2.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l +; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] +; GFX1150-TRUE16-NEXT: s_endpgm +; +; GFX1150-FAKE16-LABEL: frem_v4f16: +; GFX1150-FAKE16: ; %bb.0: +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-FAKE16-NEXT: s_clause 0x1 +; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX1150-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v8, v8 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6 +; GFX1150-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0 +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2 +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6 +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: 
v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5 +; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5 +; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3 +; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1150-FAKE16-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16