diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..ed52a56355486 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,17 +28,26 @@
 /// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
 /// the VGPR_32, the COPY can be completely eliminated.
 ///
+/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32/F16,
+/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs so that they can be
+/// co-issued. This helps overlap MFMAs with certain vector instructions in
+/// machine schedules and is expected to improve performance. Only packed
+/// instructions that are covered by the MFMA latency are unpacked; the rest
+/// are left untouched.
+/// TODO: Add support for F16 packed instructions.
 //===----------------------------------------------------------------------===//
 
 #include "GCNPreRAOptimizations.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/InitializePasses.h"
-
 using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -53,6 +62,38 @@ class GCNPreRAOptimizationsImpl {
   LiveIntervals *LIS;
 
   bool processReg(Register Reg);
+  // Create the list of packed instructions following an MFMA that are
+  // suitable for unpacking.
+  bool createListOfPackedInstr(MachineInstr &BeginMI,
+                               SetVector<MachineInstr *> &InstrsToUnpack,
+                               uint16_t NumMFMACycles);
+  // Check whether the machine instruction being processed is a supported
+  // packed instruction.
+  bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+  // Unpack F32 packed instructions. Currently only V_PK_MUL_F32,
+  // V_PK_ADD_F32, and V_PK_FMA_F32 are supported for this transformation.
+  void processF32Unpacking(MachineInstr &I);
+  // Select the unpacked opcode that corresponds to the given packed
+  // instruction.
+  uint16_t mapToUnpackedOpcode(MachineInstr &I);
+  // Insert the unpacked instructions into the basic block.
+  void insertUnpackedF32MI(MachineInstr &I, bool IsVreg_64, bool IsFMA);
+  // Create the unpacked instruction to be inserted. Adds source modifiers to
+  // the unpacked instruction based on the source modifiers in the packed
+  // instruction.
+  MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
+                                       const DebugLoc &DL,
+                                       uint16_t UnpackedOpcode, bool IsHiBits,
+                                       bool IsFMA);
+  // Identify register dependencies between the registers used by the MFMA
+  // instruction and the following packed instructions. Conservatively ensures
+  // that we do not incorrectly read or write registers.
+  bool hasReadWriteDependencies(const MachineInstr &PredMI,
+                                const MachineInstr &SuccMI);
+
+  void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
+                         unsigned NegModifier, unsigned OpSelModifier,
+                         MachineOperand &SrcMO);
 
 public:
   GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -225,6 +266,228 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
   return true;
 }
 
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
+    MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("Fully covered switch");
+}
+
+uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
+  unsigned Opcode = I.getOpcode();
+  // Use the 64-bit encoding so that VOP3 instructions are produced: VOP3P
+  // source modifiers can be translated to VOP3 modifiers, whereas e32 (VOP2)
+  // instructions do not allow source modifiers.
+  switch (Opcode) {
+  case AMDGPU::V_PK_ADD_F32:
+    return AMDGPU::V_ADD_F32_e64;
+  case AMDGPU::V_PK_MUL_F32:
+    return AMDGPU::V_MUL_F32_e64;
+  case AMDGPU::V_PK_FMA_F32:
+    return AMDGPU::V_FMA_F32_e64;
+  default:
+    return std::numeric_limits<uint16_t>::max();
+  }
+  llvm_unreachable("Fully covered switch");
+}
+
+bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(
+    const MachineInstr &PredMI, const MachineInstr &SuccMI) {
+  for (const MachineOperand &Pred_Ops : PredMI.operands()) {
+    if (!Pred_Ops.isReg() || !Pred_Ops.isDef())
+      continue;
+    Register Pred_Reg = Pred_Ops.getReg();
+    if (!Pred_Reg.isValid())
+      continue;
+    for (const MachineOperand &Succ_Ops : SuccMI.operands()) {
+      if (!Succ_Ops.isReg() || !Succ_Ops.isDef())
+        continue;
+      Register Succ_Reg = Succ_Ops.getReg();
+      if (!Succ_Reg.isValid())
+        continue;
+      if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
+    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+    uint16_t NumMFMACycles) {
+  auto *BB = BeginMI.getParent();
+  auto E = BB->end();
+  int TotalCyclesBetweenCandidates = 0;
+  auto SchedModel = TII->getSchedModel();
+  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+    MachineInstr &Instr = *I;
+    const MCSchedClassDesc *InstrSchedClassDesc =
+        SchedModel.resolveSchedClass(&Instr);
+    TotalCyclesBetweenCandidates +=
+        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+
+    if (Instr.isMetaInstruction())
+      continue;
+    if (Instr.isTerminator())
+      return false;
+    if (TotalCyclesBetweenCandidates > NumMFMACycles)
+      return false;
+    if (isUnpackingSupportedInstr(Instr) && TII->isNeverCoissue(Instr)) {
+      if (hasReadWriteDependencies(BeginMI, Instr))
+        return false;
+
+      // If this is a packed instruction, subtract its latency from the
+      // overall latency calculation here, because the packed instruction
+      // will be removed and replaced by two unpacked instructions.
+      TotalCyclesBetweenCandidates -=
+          SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+      // Add 2 to account for the latency of the two instructions produced by
+      // unpacking. At the time of writing, the considered unpacked
+      // instructions have a latency of 1.
+      // TODO: Improve latency handling of possibly inserted instructions.
+      TotalCyclesBetweenCandidates += 2;
+      if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
+        InstrsToUnpack.insert(&Instr);
+    }
+  }
+  return true;
+}
+
+void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(MachineInstr &I,
+                                                    bool IsVreg_64,
+                                                    bool IsFMA) {
+  MachineBasicBlock &MBB = *I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  Register DstReg = I.getOperand(0).getReg();
+
+  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+  if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+    return;
+
+  MachineInstrBuilder Op0L_Op1L = createUnpackedMI(
+      MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA);
+  if (IsVreg_64 || I.getOperand(0).isUndef())
+    Op0L_Op1L->getOperand(0).setIsUndef();
+  LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
+
+  MachineInstrBuilder Op0H_Op1H = createUnpackedMI(
+      MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA);
+  LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
+
+  if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+    Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+    Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+  }
+  if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
+    Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
+    Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
+  }
+
+  LIS->RemoveMachineInstrFromMaps(I);
+  I.eraseFromParent();
+  LIS->removeInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(DstReg);
+}
+
+void GCNPreRAOptimizationsImpl::addOperandandMods(MachineInstrBuilder NewMI,
+                                                  unsigned Src_Mods,
+                                                  unsigned NegModifier,
+                                                  unsigned OpSelModifier,
+                                                  MachineOperand &SrcMO) {
+  unsigned New_Src_Mods = 0;
+  // Only the negation modifiers need to be forwarded. If NEG or NEG_HI is
+  // set, the corresponding 32-bit lane must be negated. NEG_HI shares its bit
+  // position with ABS, but packed instructions do not support ABS, so NEG_HI
+  // always means "negate the high 32 bits" and must be translated to the NEG
+  // source modifier on the unpacked VOP3 instruction. Since packed
+  // instructions never carry ABS, there is nothing to forward for it.
+  if (Src_Mods & NegModifier)
+    New_Src_Mods |= SISrcMods::NEG;
+  // Unpacked VOP3 operations do not have op_sel, so it is handled explicitly
+  // below by selecting the sub-register that op_sel refers to.
+  NewMI.addImm(New_Src_Mods);
+  if (SrcMO.isImm()) {
+    NewMI.addImm(SrcMO.getImm());
+  } else {
+    if (Src_Mods & OpSelModifier) {
+      // If op_sel is set, select the high 32 bits (sub1) of reg:sub0_sub1.
+      unsigned Src0SubIdx =
+          TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub1);
+      NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx);
+    } else {
+      // If op_sel is 0, select the low 32 bits (sub0) of reg:sub0_sub1.
+      unsigned Src0SubIdx =
+          TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub0);
+      NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx);
+    }
+  }
+}
+
+MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
+    MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL,
+    uint16_t UnpackedOpcode, bool IsHiBits, bool IsFMA) {
+  MachineOperand &DstMO = I.getOperand(0);
+  MachineOperand &SrcMO1 = I.getOperand(2);
+  MachineOperand &SrcMO2 = I.getOperand(4);
+  Register DstReg = DstMO.getReg();
+  unsigned DestSubIdx =
+      IsHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1)
+               : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
+  int ClampIdx =
+      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+  int64_t ClampVal = I.getOperand(ClampIdx).getImm();
+  int Src0_modifiers_Idx =
+      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+  int Src1_modifiers_Idx =
+      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+
+  unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
+  unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
+  // Packed instructions (VOP3P) do not support ABS, so it can be ignored.
+
+  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+
+  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+  NewMI.addDef(DstReg, 0, DestSubIdx); // vdst
+  addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1);
+  addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2);
+
+  if (IsFMA) {
+    MachineOperand &SrcMO3 = I.getOperand(6);
+    int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
+        I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+    unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
+    addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3);
+  }
+  NewMI.addImm(ClampVal); // clamp
+  // Packed instructions do not support output modifiers, so it is safe to
+  // pass 0 here.
+  NewMI.addImm(0); // omod
+  return NewMI;
+}
+
+void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
+  bool IsFMA = I.getOpcode() == AMDGPU::V_PK_FMA_F32;
+  const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
+  bool IsVReg64 = DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID;
+  insertUnpackedF32MI(I, IsVReg64, IsFMA);
+}
+
 bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -260,38 +523,57 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
     Changed |= processReg(Reg);
   }
 
-  if (!ST.useRealTrue16Insts())
-    return Changed;
-
-  // Add RA hints to improve True16 COPY elimination.
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      if (MI.getOpcode() != AMDGPU::COPY)
-        continue;
-      Register Dst = MI.getOperand(0).getReg();
-      Register Src = MI.getOperand(1).getReg();
-      if (Dst.isVirtual() &&
-          MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
-          Src.isPhysical() &&
-          TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
-        MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
-      if (Src.isVirtual() &&
-          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
-          Dst.isPhysical() &&
-          TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
-        MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
-      if (!Dst.isVirtual() || !Src.isVirtual())
-        continue;
-      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
-          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
-        MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
-        MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+  // Unpack packed instructions that are overlapped by an MFMA so that the
+  // compiler can co-issue the unpacked instructions with the MFMA.
+  for (MachineBasicBlock &MBB : MF) {
+    SetVector<MachineInstr *> InstrsToUnpack;
+    uint16_t NumMFMACycles = 0;
+    auto SchedModel = TII->getSchedModel();
+    for (MachineInstr &MI : MBB) {
+      if (SIInstrInfo::isMFMA(MI)) {
+        const MCSchedClassDesc *SchedClassDesc =
+            SchedModel.resolveSchedClass(&MI);
+        NumMFMACycles =
+            SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+        createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles);
+      }
+      if (ST.useRealTrue16Insts()) {
+        if (MI.getOpcode() != AMDGPU::COPY)
+          continue;
+        Register Dst = MI.getOperand(0).getReg();
+        Register Src = MI.getOperand(1).getReg();
+        if (Dst.isVirtual() &&
+            MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+            Src.isPhysical() &&
+            TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+        if (Src.isVirtual() &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+            Dst.isPhysical() &&
+            TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+        if (!Dst.isVirtual() || !Src.isVirtual())
+          continue;
+        if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+          MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+          MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+        }
+        if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+      }
     }
-  }
+    if (!InstrsToUnpack.empty()) {
+      for (MachineInstr *MI : InstrsToUnpack) {
+        processF32Unpacking(*MI);
+      }
+    }
+  }
+  LIS->reanalyze(MF);
 
   return Changed;
-}
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d500858841a41..7fb08a36de96c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6359,6 +6359,66 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
   return isImmOperandLegal(MI, OpIdx, *MO);
 }
 
+bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
+  bool IsGFX950Only = ST.hasGFX950Insts();
+  bool IsGFX940Only = ST.hasGFX940Insts();
+
+  if (!IsGFX950Only && !IsGFX940Only)
+    return false;
+
+  if (!isVALU(MI))
+    return false;
+
+  // V_COS, V_EXP, V_RCP, etc.
+  if (isTRANS(MI))
+    return true;
+
+  // DOT2, DOT2C, DOT4, etc.
+  if (isDOT(MI))
+    return true;
+
+  // MFMA, SMFMA
+  if (isMFMA(MI))
+    return true;
+
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case AMDGPU::V_CVT_PK_BF8_F32_e64:
+  case AMDGPU::V_CVT_PK_FP8_F32_e64:
+  case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+  case AMDGPU::V_MQSAD_U32_U8_e64:
+  case AMDGPU::V_PK_ADD_F16:
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_ADD_I16:
+  case AMDGPU::V_PK_ADD_U16:
+  case AMDGPU::V_PK_ASHRREV_I16:
+  case AMDGPU::V_PK_FMA_F16:
+  case AMDGPU::V_PK_FMA_F32:
+  case AMDGPU::V_PK_FMAC_F16_e32:
+  case AMDGPU::V_PK_FMAC_F16_e64:
+  case AMDGPU::V_PK_LSHLREV_B16:
+  case AMDGPU::V_PK_LSHRREV_B16:
+  case AMDGPU::V_PK_MAD_I16:
+  case AMDGPU::V_PK_MAD_U16:
+  case AMDGPU::V_PK_MAX_F16:
+  case AMDGPU::V_PK_MAX_I16:
+  case AMDGPU::V_PK_MAX_U16:
+  case AMDGPU::V_PK_MIN_F16:
+  case AMDGPU::V_PK_MIN_I16:
+  case AMDGPU::V_PK_MIN_U16:
+  case AMDGPU::V_PK_MOV_B32:
+  case AMDGPU::V_PK_MUL_F16:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_MUL_LO_U16:
+  case AMDGPU::V_PK_SUB_I16:
+  case AMDGPU::V_PK_SUB_U16:
+  case AMDGPU::V_QSAD_PK_U16_U8_e64:
+    return true;
+  default:
+    return false;
+  }
+}
+
 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
                                        MachineInstr &MI) const {
   unsigned Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c964d02ee2b97..25ba2c4bea785 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1187,6 +1187,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   bool isImmOperandLegal(const MCInstrDesc &InstDesc, unsigned OpNo,
                          const MachineOperand &MO) const;
 
+  bool isNeverCoissue(MachineInstr &MI) const;
+
   bool isLiteralOperandLegal(const MCInstrDesc &InstDesc,
                              const MCOperandInfo &OpInfo) const;
 
@@ -1200,7 +1202,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
   bool isLegalAV64PseudoImm(uint64_t Imm) const;
 
-
   /// Return true if this 64-bit VALU instruction has a 32-bit encoding.
   /// This function will return false if you pass it a 32-bit instruction.
   bool hasVALU32BitEncoding(unsigned Opcode) const;
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
new file mode 100644
index 0000000000000..6b871b1d1881b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
@@ -0,0 +1,154 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: test_pk_mul_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+
+    ; GCN-LABEL: name: test_pk_mul_unpacking_f32
+    ; GCN: liveins: $sgpr4_sgpr5
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: KILL %1.sub6_sub7
+    ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+    ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+    ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+    ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+    ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+    ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+    %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+    %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+    early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+    KILL %8.sub6_sub7
+    early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+    %57:vreg_128_align2 = COPY %22
+    %58:vreg_128_align2 = COPY %23
+    undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+    %69.sub1:vreg_64_align2 = COPY %39.sub7
+    undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+    undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %75.sub1:vreg_64_align2 = COPY %39.sub5
+    %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+name: test_op_sel_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+
+    ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32
+    ; GCN: liveins: $sgpr4_sgpr5
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: KILL %1.sub6_sub7
+    ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+    ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+    ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+    ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+    ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+    ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+    %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+    %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+    early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+    KILL %8.sub6_sub7
+    early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+    %57:vreg_128_align2 = COPY %22
+    %58:vreg_128_align2 = COPY %23
+    undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+    %69.sub1:vreg_64_align2 = COPY %39.sub7
+    undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+    undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %75.sub1:vreg_64_align2 = COPY %39.sub5
+    %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 12, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+name: test_op_sel_hi_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+  - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+    ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+    ; GCN: liveins: $sgpr4_sgpr5
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+    ; GCN-NEXT: KILL %1.sub6_sub7
+    ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+    ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+    ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+    ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+    ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+    ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: S_ENDPGM 0
+    %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+    %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+    %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+    early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+    KILL %8.sub6_sub7
+    early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+    %57:vreg_128_align2 = COPY %22
+    %58:vreg_128_align2 = COPY %23
+    undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+    %69.sub1:vreg_64_align2 = COPY %39.sub7
+    undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+    undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %75.sub1:vreg_64_align2 = COPY %39.sub5
+    %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 0, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+
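Editor's note: for reference, a minimal before/after sketch of the rewrite that processF32Unpacking performs, assuming a V_PK_MUL_F32 with no NEG/OP_SEL modifiers; the virtual register numbers below are illustrative and are not taken from the tests above.

  # Before: one packed multiply, which isNeverCoissue() reports as unable to
  # co-issue with the neighbouring MFMA.
  undef %10.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %20.sub4_sub5, 8, %30, 0, 0, 0, 0, 0, implicit $mode, implicit $exec

  # After: two VOP3 multiplies writing the low and high 32-bit lanes
  # separately, which the scheduler can overlap with the MFMA.
  undef %10.sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %20.sub4, 0, %30.sub0, 0, 0, implicit $mode, implicit $exec
  %10.sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %20.sub5, 0, %30.sub1, 0, 0, implicit $mode, implicit $exec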