diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..ed52a56355486 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,17 +28,26 @@
 /// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
 /// the VGPR_32, the COPY can be completely eliminated.
 ///
+/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32/F16,
+/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs so that they can be
+/// co-issued. This helps overlap MFMAs with certain vector instructions in
+/// machine schedules and is expected to improve performance. Only packed
+/// instructions that are covered by the MFMA latency are unpacked; the rest
+/// are left untouched.
+/// TODO: Add support for F16 packed instructions.
 //===----------------------------------------------------------------------===//
 
 #include "GCNPreRAOptimizations.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/InitializePasses.h"
-
 using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -53,6 +62,38 @@ class GCNPreRAOptimizationsImpl {
   LiveIntervals *LIS;
 
   bool processReg(Register Reg);
+  // Create the list of packed instructions following an MFMA that are
+  // suitable for unpacking.
+  bool createListOfPackedInstr(MachineInstr &BeginMI,
+                               SetVector<MachineInstr *> &InstrsToUnpack,
+                               uint16_t NumMFMACycles);
+  // Check whether the machine instruction being processed is a supported
+  // packed instruction.
+  bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+  // Unpack F32 packed instructions. Currently only V_PK_MUL_F32,
+  // V_PK_ADD_F32, and V_PK_FMA_F32 are supported for this transformation.
+  void processF32Unpacking(MachineInstr &I);
+  // Select the unpacked opcode that corresponds to the given packed
+  // instruction.
+  uint16_t mapToUnpackedOpcode(MachineInstr &I);
+  // Insert the unpacked instructions into the basic block.
+  void insertUnpackedF32MI(MachineInstr &I, bool IsVreg_64, bool IsFMA);
+  // Create the unpacked instruction to be inserted. Adds source modifiers to
+  // the unpacked instruction based on the source modifiers in the packed
+  // instruction.
+  MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
+                                       const DebugLoc &DL,
+                                       uint16_t UnpackedOpcode, bool IsHiBits,
+                                       bool IsFMA);
+  // Identify register dependencies between the registers used by the MFMA
+  // instruction and the following packed instructions. Conservatively ensures
+  // that we do not incorrectly read or write registers.
+  bool hasReadWriteDependencies(const MachineInstr &PredMI,
+                                const MachineInstr &SuccMI);
+
+  void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
+                         unsigned NegModifier, unsigned OpSelModifier,
+                         MachineOperand &SrcMO);
 
 public:
   GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -225,6 +266,228 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
   return true;
 }
 
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
+    MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("Fully covered switch");
+}
+
+uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
+  unsigned Opcode = I.getOpcode();
+  // Use the 64-bit encoding so that VOP3 instructions are produced: VOP3P
+  // source modifiers can be translated to VOP3 modifiers, whereas e32 (VOP2)
+  // instructions do not allow source modifiers.
+  switch (Opcode) {
+  case AMDGPU::V_PK_ADD_F32:
+    return AMDGPU::V_ADD_F32_e64;
+  case AMDGPU::V_PK_MUL_F32:
+    return AMDGPU::V_MUL_F32_e64;
+  case AMDGPU::V_PK_FMA_F32:
+    return AMDGPU::V_FMA_F32_e64;
+  default:
+    return std::numeric_limits<uint16_t>::max();
+  }
+  llvm_unreachable("Fully covered switch");
+}
+
+bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(
+    const MachineInstr &PredMI, const MachineInstr &SuccMI) {
+  for (const MachineOperand &Pred_Ops : PredMI.operands()) {
+    if (!Pred_Ops.isReg() || !Pred_Ops.isDef())
+      continue;
+    Register Pred_Reg = Pred_Ops.getReg();
+    if (!Pred_Reg.isValid())
+      continue;
+    for (const MachineOperand &Succ_Ops : SuccMI.operands()) {
+      if (!Succ_Ops.isReg() || !Succ_Ops.isDef())
+        continue;
+      Register Succ_Reg = Succ_Ops.getReg();
+      if (!Succ_Reg.isValid())
+        continue;
+      if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
+    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+    uint16_t NumMFMACycles) {
+  auto *BB = BeginMI.getParent();
+  auto E = BB->end();
+  int TotalCyclesBetweenCandidates = 0;
+  auto SchedModel = TII->getSchedModel();
+  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+    MachineInstr &Instr = *I;
+    const MCSchedClassDesc *InstrSchedClassDesc =
+        SchedModel.resolveSchedClass(&Instr);
+    TotalCyclesBetweenCandidates +=
+        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+
+    if (Instr.isMetaInstruction())
+      continue;
+    if (Instr.isTerminator())
+      return false;
+    if (TotalCyclesBetweenCandidates > NumMFMACycles)
+      return false;
+    if (isUnpackingSupportedInstr(Instr) && TII->isNeverCoissue(Instr)) {
+      if (hasReadWriteDependencies(BeginMI, Instr))
+        return false;
+
+      // If this is a packed instruction, subtract its latency from the
+      // overall latency calculation here, because the packed instruction
+      // will be removed and replaced by two unpacked instructions.
+      TotalCyclesBetweenCandidates -=
+          SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+      // Add 2 to account for the latency of the two instructions produced by
+      // unpacking. At the time of writing, the considered unpacked
+      // instructions have a latency of 1.
+      // TODO: Improve latency handling of possibly inserted instructions.
+      TotalCyclesBetweenCandidates += 2;
+      if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
+        InstrsToUnpack.insert(&Instr);
+    }
+  }
+  return true;
+}
+
+void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(MachineInstr &I,
+                                                    bool IsVreg_64,
+                                                    bool IsFMA) {
+  MachineBasicBlock &MBB = *I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  Register DstReg = I.getOperand(0).getReg();
+
+  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+  if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+    return;
+
+  MachineInstrBuilder Op0L_Op1L = createUnpackedMI(
+      MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA);
+  if (IsVreg_64 || I.getOperand(0).isUndef())
+    Op0L_Op1L->getOperand(0).setIsUndef();
+  LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
+
+  MachineInstrBuilder Op0H_Op1H = createUnpackedMI(
+      MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA);
+  LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
+
+  if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+    Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+    Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+  }
+  if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
+    Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
+    Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
+  }
+
+  LIS->RemoveMachineInstrFromMaps(I);
+  I.eraseFromParent();
+  LIS->removeInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(DstReg);
+}
+
+void GCNPreRAOptimizationsImpl::addOperandandMods(MachineInstrBuilder NewMI,
+                                                  unsigned Src_Mods,
+                                                  unsigned NegModifier,
+                                                  unsigned OpSelModifier,
+                                                  MachineOperand &SrcMO) {
+  unsigned New_Src_Mods = 0;
+  // Only the negation modifiers need to be forwarded. If NEG or NEG_HI is
+  // set, the corresponding 32-bit lane must be negated. NEG_HI shares its bit
+  // position with ABS, but packed instructions do not support ABS, so NEG_HI
+  // always means "negate the high 32 bits" and must be translated to the NEG
+  // source modifier on the unpacked VOP3 instruction. Since packed
+  // instructions never carry ABS, there is nothing to forward for it.
+  if (Src_Mods & NegModifier)
+    New_Src_Mods |= SISrcMods::NEG;
+  // Unpacked VOP3 operations do not have op_sel, so it is handled explicitly
+  // below by selecting the sub-register that op_sel refers to.
+  NewMI.addImm(New_Src_Mods);
+  if (SrcMO.isImm()) {
+    NewMI.addImm(SrcMO.getImm());
+  } else {
+    if (Src_Mods & OpSelModifier) {
+      // If op_sel is set, select the high 32 bits (sub1) of reg:sub0_sub1.
+      unsigned Src0SubIdx =
+          TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub1);
+      NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx);
+    } else {
+      // If op_sel is 0, select the low 32 bits (sub0) of reg:sub0_sub1.
+      unsigned Src0SubIdx =
+          TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub0);
+      NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx);
+    }
+  }
+}
+
+MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
+    MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL,
+    uint16_t UnpackedOpcode, bool IsHiBits, bool IsFMA) {
+  MachineOperand &DstMO = I.getOperand(0);
+  MachineOperand &SrcMO1 = I.getOperand(2);
+  MachineOperand &SrcMO2 = I.getOperand(4);
+  Register DstReg = DstMO.getReg();
+  unsigned DestSubIdx =
+      IsHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1)
+               : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
+  int ClampIdx =
+      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+  int64_t ClampVal = I.getOperand(ClampIdx).getImm();
+  int Src0_modifiers_Idx =
+      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+  int Src1_modifiers_Idx =
+      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+
+  unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
+  unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
+  // Packed instructions (VOP3P) do not support ABS, so it can be ignored.
+
+  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+
+  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+  NewMI.addDef(DstReg, 0, DestSubIdx); // vdst
+  addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1);
+  addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2);
+
+  if (IsFMA) {
+    MachineOperand &SrcMO3 = I.getOperand(6);
+    int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
+        I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+    unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
+    addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3);
+  }
+  NewMI.addImm(ClampVal); // clamp
+  // Packed instructions do not support output modifiers, so it is safe to
+  // pass 0 here.
+  NewMI.addImm(0); // omod
+  return NewMI;
+}
+
+void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
+  bool IsFMA = I.getOpcode() == AMDGPU::V_PK_FMA_F32;
+  const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
+  bool IsVReg64 = DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID;
+  insertUnpackedF32MI(I, IsVReg64, IsFMA);
+}
+
 bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -260,38 +523,57 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
     Changed |= processReg(Reg);
   }
 
-  if (!ST.useRealTrue16Insts())
-    return Changed;
-
-  // Add RA hints to improve True16 COPY elimination.
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      if (MI.getOpcode() != AMDGPU::COPY)
-        continue;
-      Register Dst = MI.getOperand(0).getReg();
-      Register Src = MI.getOperand(1).getReg();
-      if (Dst.isVirtual() &&
-          MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
-          Src.isPhysical() &&
-          TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
-        MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
-      if (Src.isVirtual() &&
-          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
-          Dst.isPhysical() &&
-          TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
-        MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
-      if (!Dst.isVirtual() || !Src.isVirtual())
-        continue;
-      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
-          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
-        MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
-        MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+  // Unpack packed instructions that are overlapped by an MFMA so that the
+  // compiler can co-issue the unpacked instructions with the MFMA.
+  for (MachineBasicBlock &MBB : MF) {
+    SetVector<MachineInstr *> InstrsToUnpack;
+    uint16_t NumMFMACycles = 0;
+    auto SchedModel = TII->getSchedModel();
+    for (MachineInstr &MI : MBB) {
+      if (SIInstrInfo::isMFMA(MI)) {
+        const MCSchedClassDesc *SchedClassDesc =
+            SchedModel.resolveSchedClass(&MI);
+        NumMFMACycles =
+            SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+        createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles);
+      }
+      if (ST.useRealTrue16Insts()) {
+        if (MI.getOpcode() != AMDGPU::COPY)
+          continue;
+        Register Dst = MI.getOperand(0).getReg();
+        Register Src = MI.getOperand(1).getReg();
+        if (Dst.isVirtual() &&
+            MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+            Src.isPhysical() &&
+            TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+        if (Src.isVirtual() &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+            Dst.isPhysical() &&
+            TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+        if (!Dst.isVirtual() || !Src.isVirtual())
+          continue;
+        if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+          MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+          MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+        }
+        if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+      }
     }
-  }
+    if (!InstrsToUnpack.empty()) {
+      for (MachineInstr *MI : InstrsToUnpack) {
+        processF32Unpacking(*MI);
+      }
+    }
+  }
+  LIS->reanalyze(MF);
 
   return Changed;
-}
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d500858841a41..7fb08a36de96c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6359,6 +6359,66 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
   return isImmOperandLegal(MI, OpIdx, *MO);
 }
 
+bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
+  bool IsGFX950Only = ST.hasGFX950Insts();
+  bool IsGFX940Only = ST.hasGFX940Insts();
+
+  if (!IsGFX950Only && !IsGFX940Only)
+    return false;
+
+  if (!isVALU(MI))
+    return false;
+
+  // V_COS, V_EXP, V_RCP, etc.
+  if (isTRANS(MI))
+    return true;
+
+  // DOT2, DOT2C, DOT4, etc.
+  if (isDOT(MI))
+    return true;
+
+  // MFMA, SMFMA
+  if (isMFMA(MI))
+    return true;
+
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case AMDGPU::V_CVT_PK_BF8_F32_e64:
+  case AMDGPU::V_CVT_PK_FP8_F32_e64:
+  case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+  case AMDGPU::V_MQSAD_U32_U8_e64:
+  case AMDGPU::V_PK_ADD_F16:
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_ADD_I16:
+  case AMDGPU::V_PK_ADD_U16:
+  case AMDGPU::V_PK_ASHRREV_I16:
+  case AMDGPU::V_PK_FMA_F16:
+  case AMDGPU::V_PK_FMA_F32:
+  case AMDGPU::V_PK_FMAC_F16_e32:
+  case AMDGPU::V_PK_FMAC_F16_e64:
+  case AMDGPU::V_PK_LSHLREV_B16:
+  case AMDGPU::V_PK_LSHRREV_B16:
+  case AMDGPU::V_PK_MAD_I16:
+  case AMDGPU::V_PK_MAD_U16:
+  case AMDGPU::V_PK_MAX_F16:
+  case AMDGPU::V_PK_MAX_I16:
+  case AMDGPU::V_PK_MAX_U16:
+  case AMDGPU::V_PK_MIN_F16:
+  case AMDGPU::V_PK_MIN_I16:
+  case AMDGPU::V_PK_MIN_U16:
+  case AMDGPU::V_PK_MOV_B32:
+  case AMDGPU::V_PK_MUL_F16:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_MUL_LO_U16:
+  case AMDGPU::V_PK_SUB_I16:
+  case AMDGPU::V_PK_SUB_U16:
+  case AMDGPU::V_QSAD_PK_U16_U8_e64:
+    return true;
+  default:
+    return false;
+  }
+}
+
 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                        MachineInstr &MI) const {
   unsigned Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c964d02ee2b97..25ba2c4bea785 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1187,6 +1187,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
                          const MachineOperand &MO) const;
 
+  bool isNeverCoissue(MachineInstr &MI) const;
+
   bool isLiteralOperandLegal(const MCInstrDesc &InstDesc,
                              const MCOperandInfo &OpInfo) const;
 
@@ -1200,7 +1202,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
   bool isLegalAV64PseudoImm(uint64_t Imm) const;
 
-
   /// Return true if this 64-bit VALU instruction has a 32-bit encoding.
   /// This function will return false if you pass it a 32-bit instruction.
   bool hasVALU32BitEncoding(unsigned Opcode) const;
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
new file mode 100644
index 0000000000000..6b871b1d1881b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
@@ -0,0 +1,154 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: test_pk_mul_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+
+    ; GCN-LABEL: name: test_pk_mul_unpacking_f32
+    ; GCN: liveins: $sgpr4_sgpr5
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: KILL %1.sub6_sub7
+    ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+    ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+    ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+    ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+    ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+    ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+    %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+    %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+    early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+    KILL %8.sub6_sub7
+    early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+    %57:vreg_128_align2 = COPY %22
+    %58:vreg_128_align2 = COPY %23
+    undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+    %69.sub1:vreg_64_align2 = COPY %39.sub7
+    undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+    undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %75.sub1:vreg_64_align2 = COPY %39.sub5
+    %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+name: test_op_sel_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+
+    ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32
+    ; GCN: liveins: $sgpr4_sgpr5
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: KILL %1.sub6_sub7
+    ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+    ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+    ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+    ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+    ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+    ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+    %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+    %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+    early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+    KILL %8.sub6_sub7
+    early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+    %57:vreg_128_align2 = COPY %22
+    %58:vreg_128_align2 = COPY %23
+    undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+    %69.sub1:vreg_64_align2 = COPY %39.sub7
+    undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+    undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %75.sub1:vreg_64_align2 = COPY %39.sub5
+    %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 12, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+name: test_op_sel_hi_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+    ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+    ; GCN: liveins: $sgpr4_sgpr5
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: KILL %1.sub6_sub7
+    ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+    ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+    ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+    ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+    ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+    ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+    %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+    %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+    early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+    KILL %8.sub6_sub7
+    early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+    %57:vreg_128_align2 = COPY %22
+    %58:vreg_128_align2 = COPY %23
+    undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+    %69.sub1:vreg_64_align2 = COPY %39.sub7
+    undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+    undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %75.sub1:vreg_64_align2 = COPY %39.sub5
+    %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 0, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+
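Editor's note: for reference, a minimal before/after sketch of the rewrite that processF32Unpacking performs, assuming a V_PK_MUL_F32 with no NEG/OP_SEL modifiers; the virtual register numbers below are illustrative and are not taken from the tests above.

  # Before: one packed multiply, which isNeverCoissue() reports as unable to
  # co-issue with the neighbouring MFMA.
  undef %10.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %20.sub4_sub5, 8, %30, 0, 0, 0, 0, 0, implicit $mode, implicit $exec

  # After: two VOP3 multiplies writing the low and high 32-bit lanes
  # separately, which the scheduler can overlap with the MFMA.
  undef %10.sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %20.sub4, 0, %30.sub0, 0, 0, implicit $mode, implicit $exec
  %10.sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %20.sub5, 0, %30.sub1, 0, 0, implicit $mode, implicit $exec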