diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 8569aa7127dc3..dd87b196a24ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -14,12 +14,7 @@ /// MFMA opcode. /// /// TODO: -/// - Handle non-tied dst+src2 cases. We need to try to find a copy from an -/// AGPR from src2, or reassign src2 to an available AGPR (which should work -/// in the common case of a load). -/// -/// - Handle multiple MFMA uses of the same register. e.g. chained MFMAs that -/// can be rewritten as a set +/// - Handle SplitKit partial copy bundles, and not just full copy instructions /// /// - Update LiveIntervals incrementally instead of recomputing from scratch /// @@ -49,13 +44,18 @@ class AMDGPURewriteAGPRCopyMFMAImpl { VirtRegMap &VRM; LiveRegMatrix &LRM; LiveIntervals &LIS; + const RegisterClassInfo &RegClassInfo; + + bool attemptReassignmentsToAGPR(SmallSetVector<Register, 4> &InterferingRegs, + MCPhysReg PrefPhysReg) const; public: AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM, - LiveRegMatrix &LRM, LiveIntervals &LIS) + LiveRegMatrix &LRM, LiveIntervals &LIS, + const RegisterClassInfo &RegClassInfo) : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), - LIS(LIS) {} + LIS(LIS), RegClassInfo(RegClassInfo) {} bool isRewriteCandidate(const MachineInstr &MI) const { return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1; } @@ -64,10 +64,10 @@ class AMDGPURewriteAGPRCopyMFMAImpl { /// Compute the register class constraints based on the uses of \p Reg, /// excluding uses from \p ExceptMI. This should be nearly identical to /// MachineRegisterInfo::recomputeRegClass. 
- const TargetRegisterClass * - recomputeRegClassExceptRewritable(Register Reg, - const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC) const; + const TargetRegisterClass *recomputeRegClassExceptRewritable( + Register Reg, const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC, + SmallVectorImpl<MachineInstr *> &RewriteCandidates) const; bool run(MachineFunction &MF) const; }; @@ -75,7 +75,8 @@ class AMDGPURewriteAGPRCopyMFMAImpl { const TargetRegisterClass * AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC) const { + const TargetRegisterClass *NewRC, + SmallVectorImpl<MachineInstr *> &RewriteCandidates) const { // Accumulate constraints from all uses. for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { @@ -86,8 +87,11 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( // effects of rewrite candidates. It just so happens that we can use either // AGPR or VGPR in src0/src1, so don't bother checking the constraint // effects of the individual operands. - if (isRewriteCandidate(*MI)) + if (isRewriteCandidate(*MI)) { + if (!is_contained(RewriteCandidates, MI)) + RewriteCandidates.push_back(MI); continue; + } unsigned OpNo = &MO - &MI->getOperand(0); NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI); @@ -98,6 +102,58 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( return NewRC; } +/// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a +/// preference to use \p PhysReg first. Returns false if the reassignments +/// cannot be trivially performed. +bool AMDGPURewriteAGPRCopyMFMAImpl::attemptReassignmentsToAGPR( + SmallSetVector<Register, 4> &InterferingRegs, MCPhysReg PrefPhysReg) const { + // FIXME: The ordering may matter here, but we're just taking uselistorder + // with the special case of ensuring to process the starting instruction + // first. 
We probably should extract the priority advisor out of greedy and + use that ordering. + for (Register InterferingReg : InterferingRegs) { + LiveInterval &ReassignLI = LIS.getInterval(InterferingReg); + const TargetRegisterClass *EquivalentAGPRRegClass = + TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); + + MCPhysReg Assignable = AMDGPU::NoRegister; + if (EquivalentAGPRRegClass->contains(PrefPhysReg) && + LRM.checkInterference(ReassignLI, PrefPhysReg) == + LiveRegMatrix::IK_Free) { + // First try to assign to the AGPR we were already copying to. This + // should be the first assignment we attempt. We have to guard + // against the use being a subregister (which doesn't have an exact + // class match). + + // TODO: If this does happen to be a subregister use, we should + // still try to assign to a subregister of the original copy result. + Assignable = PrefPhysReg; + } else { + ArrayRef<MCPhysReg> AllocOrder = + RegClassInfo.getOrder(EquivalentAGPRRegClass); + for (MCPhysReg Reg : AllocOrder) { + if (LRM.checkInterference(ReassignLI, Reg) == LiveRegMatrix::IK_Free) { + Assignable = Reg; + break; + } + } + } + + if (!Assignable) { + LLVM_DEBUG(dbgs() << "Unable to reassign VGPR " + << printReg(InterferingReg, &TRI) + << " to a free AGPR\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Reassigning VGPR " << printReg(InterferingReg, &TRI) + << " to " << printReg(Assignable, &TRI) << '\n'); + LRM.assign(ReassignLI, Assignable); + } + + return true; +} + bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // This only applies on subtargets that have a configurable AGPR vs. VGPR // allocation. 
@@ -127,7 +183,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { LiveInterval &LI = LIS.getInterval(VReg); - // TODO: Test multiple uses for (VNInfo *VNI : LI.vnis()) { MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); @@ -136,22 +191,27 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { if (!DefMI || !DefMI->isFullCopy()) continue; - Register CopySrcReg = DefMI->getOperand(1).getReg(); - if (!CopySrcReg.isVirtual()) + Register MFMADstReg = DefMI->getOperand(1).getReg(); + if (!MFMADstReg.isVirtual()) continue; - LiveInterval &CopySrcLI = LIS.getInterval(CopySrcReg); + LiveInterval &CopySrcLI = LIS.getInterval(MFMADstReg); LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot()); - MachineInstr *CopySrcMI = LIS.getInstructionFromIndex(LRQ.valueIn()->def); - if (!CopySrcMI) + MachineInstr *MFMA = LIS.getInstructionFromIndex(LRQ.valueIn()->def); + if (!MFMA) continue; - int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(CopySrcMI->getOpcode()); + int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(MFMA->getOpcode()); if (AGPROp == -1) continue; - MachineOperand *Src2 = - TII.getNamedOperand(*CopySrcMI, AMDGPU::OpName::src2); + MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2); + if (!Src2->isReg()) + continue; + + Register Src2Reg = Src2->getReg(); + if (!Src2Reg.isVirtual()) + continue; // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead // of an AGPR or VGPR subclass, so we can't simply use the result on the @@ -163,28 +223,22 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { << " Dst=[" << printReg(VReg) << " => " << printReg(PhysReg, &TRI) << "], Src2=[" << printReg(Src2->getReg(), &TRI) << " => " - << printReg(Src2PhysReg, &TRI) << "]: " << *CopySrcMI; + << printReg(Src2PhysReg, &TRI) << "]: " << *MFMA; }); - // If the inputs are tied and the same register, we can shortcut and - // directly replace the register. 
- if (!Src2->isReg() || Src2->getReg() != CopySrcReg || - Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { - LLVM_DEBUG( - dbgs() - << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); - // TODO: Only handles the tied case for now. If the input operand is a - // different register, we need to also reassign it (either by looking - // for a compatible copy-from-AGPR, or by seeing if an available AGPR is - // compatible with all other uses. + const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(Src2->getReg()); + const TargetRegisterClass *NewDstConstraintRC = + TII.getRegClass(TII.get(AGPROp), 0, &TRI, MF); + const TargetRegisterClass *NewSrc2ConstraintRC = NewDstConstraintRC; - // If we can't reassign it, we'd need to introduce a different copy - // which is likely worse than the copy we'd be saving. - continue; - } + assert(NewSrc2ConstraintRC == TII.getRegClass(TII.get(AGPROp), + Src2->getOperandNo(), &TRI, + MF) && + "expected src2 and dst to have same class constraint"); - const TargetRegisterClass *Src2VirtRegRC = - MRI.getRegClass(Src2->getReg()); + const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2Reg); + + SmallVector<MachineInstr *, 4> DstRewriteCandidates; // We've found av = COPY (MFMA), and need to verify that we can trivially // rewrite src2 to use the new AGPR. If we can't trivially replace it, @@ -192,35 +246,129 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // first place, as well as need to assign another register, and need to // figure out where to put them. The live range splitting is smarter than // anything we're doing here, so trust it did something reasonable. 
- const TargetRegisterClass *Src2ExceptRC = - recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC, - VirtRegRC); - if (!Src2ExceptRC) { - LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n"); + const TargetRegisterClass *DstExceptRC = + recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC, + DstRewriteCandidates); + if (!DstExceptRC) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass of " + << printReg(MFMADstReg, &TRI) << '\n'); continue; } - const TargetRegisterClass *NewSrc2ConstraintRC = - TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF); - // Try to constrain src2 to the replacement instruction candidate's // register class. - const TargetRegisterClass *NewSrc2RC = - TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); - if (!NewSrc2RC) { - LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI) + const TargetRegisterClass *NewDstRC = + TRI.getCommonSubClass(DstExceptRC, NewDstConstraintRC); + if (!NewDstRC) { + LLVM_DEBUG(dbgs() << "Other uses of " << printReg(MFMADstReg, &TRI) << " are incompatible with replacement class\n"); continue; } + // If the inputs are tied and the same register, we can shortcut and + // directly replace the register. + if (Src2->getReg() != MFMADstReg || + Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { + // If src2 and dst are different registers, we need to also reassign the + // input to an available AGPR if it is compatible with all other uses. + // + // If we can't reassign it, we'd need to introduce a different copy + // which is likely worse than the copy we'd be saving. 
+ SmallVector<MachineInstr *, 4> Src2RewriteCandidates; + const TargetRegisterClass *Src2ExceptRC = + recomputeRegClassExceptRewritable(Src2Reg, Src2RC, VirtRegRC, + Src2RewriteCandidates); + if (!Src2ExceptRC) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass of " + << printReg(Src2Reg, &TRI) << '\n'); + continue; + } + + const TargetRegisterClass *NewSrc2RC = + TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); + if (!NewSrc2RC) { + LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2Reg, &TRI) + << " are incompatible with AGPR replacement\n"); + continue; + } + + // It's likely that the MFMA is used in sequence with other MFMAs; if we + // cannot migrate the full use/def chain of MFMAs, we would need to + // introduce intermediate copies somewhere. So we only make the + // transform if all the interfering MFMAs can also be migrated. Collect + // the set of rewritable MFMAs and check if we can assign an AGPR at + // that point. + // + // If any of the MFMAs aren't reassignable, we give up and rollback to + // the original register assignments. + + using RecoloringStack = + SmallVector<std::pair<LiveInterval *, MCRegister>, 8>; + + SmallSetVector<Register, 4> InterferingRegs; + + // Make sure we reassign the MFMA we found the copy from first. We want + // to ensure dst ends up in the physreg we were originally copying to. 
+ InterferingRegs.insert(MFMADstReg); + + RecoloringStack TentativeReassignments; + + for (MachineInstr *RewriteCandidate : Src2RewriteCandidates) { + MachineOperand *CandDst = + TII.getNamedOperand(*RewriteCandidate, AMDGPU::OpName::vdst); + MachineOperand *CandSrc2 = + TII.getNamedOperand(*RewriteCandidate, AMDGPU::OpName::src2); + + InterferingRegs.insert(CandDst->getReg()); + if (CandDst->getReg() != CandSrc2->getReg()) + InterferingRegs.insert(CandSrc2->getReg()); + } + + for (Register InterferingReg : InterferingRegs) { + LiveInterval &LI = LIS.getInterval(InterferingReg); + TentativeReassignments.push_back({&LI, VRM.getPhys(InterferingReg)}); + LRM.unassign(LI); + } + + if (!attemptReassignmentsToAGPR(InterferingRegs, PhysReg)) { + // Roll back the register assignments to the original state. + for (auto [LI, OldAssign] : TentativeReassignments) { + if (VRM.hasPhys(LI->reg())) + LRM.unassign(*LI); + LRM.assign(*LI, OldAssign); + } + + continue; + } + + // Fixup the register classes of the virtual registers now that we've + // committed to the reassignments. + for (Register InterferingReg : InterferingRegs) { + const TargetRegisterClass *EquivalentAGPRRegClass = + TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); + MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass); + } + + for (MachineInstr *RewriteCandidate : Src2RewriteCandidates) { + int NewMFMAOp = + AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode()); + RewriteCandidate->setDesc(TII.get(NewMFMAOp)); + } + + // We likely left an identity copy behind after assignment; let + // VirtRegRewriter deal with it later. + MadeChange = true; + continue; + } + MRI.setRegClass(VReg, AssignedRC); - MRI.setRegClass(Src2->getReg(), NewSrc2RC); + MRI.setRegClass(MFMADstReg, NewDstRC); - CopySrcMI->setDesc(TII.get(AGPROp)); + MFMA->setDesc(TII.get(AGPROp)); // Perform replacement of the register, rewriting the rewritable uses. 
for (MachineInstr &UseMI : - make_early_inc_range(MRI.reg_instructions(CopySrcReg))) { + make_early_inc_range(MRI.reg_instructions(MFMADstReg))) { if (TII.isMAI(UseMI)) { // Note the register we need to rewrite may still appear in src0/src1, // but that's fine since those can use A or V anyway. @@ -229,10 +377,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { UseMI.setDesc(TII.get(ReplacementOp)); } - UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI); + UseMI.substituteRegister(MFMADstReg, VReg, AMDGPU::NoSubRegister, TRI); } - LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI); + LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *MFMA); // We left behind an identity copy, so delete it. LIS.RemoveMachineInstrFromMaps(*DefMI); @@ -243,7 +391,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // We don't need the liveness information anymore, so don't bother // updating the intervals. Just delete the stale information. // TODO: Is it worth preserving these? 
- LIS.removeInterval(CopySrcReg); + LIS.removeInterval(MFMADstReg); LIS.removeInterval(VReg); LIS.createAndComputeVirtRegInterval(VReg); @@ -257,6 +405,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass { public: static char ID; + RegisterClassInfo RegClassInfo; AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) { initializeAMDGPURewriteAGPRCopyMFMALegacyPass( @@ -302,11 +451,13 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction( if (skipFunction(MF.getFunction())) return false; + RegClassInfo.runOnMachineFunction(MF); + auto &VRM = getAnalysis().getVRM(); auto &LRM = getAnalysis().getLRM(); auto &LIS = getAnalysis().getLIS(); - AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo); return Impl.run(MF); } @@ -316,8 +467,10 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, VirtRegMap &VRM = MFAM.getResult(MF); LiveRegMatrix &LRM = MFAM.getResult(MF); LiveIntervals &LIS = MFAM.getResult(MF); + RegisterClassInfo RegClassInfo; + RegClassInfo.runOnMachineFunction(MF); - AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo); if (!Impl.run(MF)) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index c3f6af3854eeb..8cce41360ab82 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -209,15 +209,14 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = 
GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 @@ -470,18 +469,17 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), 
%bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: 
$vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -557,18 +555,17 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; 
CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -645,16 +642,15 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr4_vgpr5, $vgpr8_vgpr9, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr4_vgpr5, $vgpr8_vgpr9, undef $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; 
CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -731,16 +727,15 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_16X16X16F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec + ; 
CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_16X16X16F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr2_agpr3_agpr4_agpr5, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23