diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index 8569aa7127dc3..dd87b196a24ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -14,12 +14,7 @@ /// MFMA opcode. /// /// TODO: -/// - Handle non-tied dst+src2 cases. We need to try to find a copy from an -/// AGPR from src2, or reassign src2 to an available AGPR (which should work -/// in the common case of a load). -/// -/// - Handle multiple MFMA uses of the same register. e.g. chained MFMAs that -/// can be rewritten as a set +/// - Handle SplitKit partial copy bundles, and not just full copy instructions /// /// - Update LiveIntervals incrementally instead of recomputing from scratch /// @@ -49,13 +44,18 @@ class AMDGPURewriteAGPRCopyMFMAImpl { VirtRegMap &VRM; LiveRegMatrix &LRM; LiveIntervals &LIS; + const RegisterClassInfo &RegClassInfo; + + bool attemptReassignmentsToAGPR(SmallSetVector<Register, 4> &InterferingRegs, + MCPhysReg PrefPhysReg) const; public: AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM, - LiveRegMatrix &LRM, LiveIntervals &LIS) + LiveRegMatrix &LRM, LiveIntervals &LIS, + const RegisterClassInfo &RegClassInfo) : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), - LIS(LIS) {} + LIS(LIS), RegClassInfo(RegClassInfo) {} bool isRewriteCandidate(const MachineInstr &MI) const { return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1; } @@ -64,10 +64,10 @@ class AMDGPURewriteAGPRCopyMFMAImpl { /// Compute the register class constraints based on the uses of \p Reg, /// excluding uses from \p ExceptMI. This should be nearly identical to /// MachineRegisterInfo::recomputeRegClass. 
- const TargetRegisterClass * - recomputeRegClassExceptRewritable(Register Reg, - const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC) const; + const TargetRegisterClass *recomputeRegClassExceptRewritable( + Register Reg, const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC, + SmallVectorImpl<MachineInstr *> &RewriteCandidates) const; bool run(MachineFunction &MF) const; }; @@ -75,7 +75,8 @@ class AMDGPURewriteAGPRCopyMFMAImpl { const TargetRegisterClass * AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( Register Reg, const TargetRegisterClass *OldRC, - const TargetRegisterClass *NewRC) const { + const TargetRegisterClass *NewRC, + SmallVectorImpl<MachineInstr *> &RewriteCandidates) const { // Accumulate constraints from all uses. for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { @@ -86,8 +87,11 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( // effects of rewrite candidates. It just so happens that we can use either // AGPR or VGPR in src0/src1, so don't bother checking the constraint // effects of the individual operands. - if (isRewriteCandidate(*MI)) + if (isRewriteCandidate(*MI)) { + if (!is_contained(RewriteCandidates, MI)) + RewriteCandidates.push_back(MI); continue; + } unsigned OpNo = &MO - &MI->getOperand(0); NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI); @@ -98,6 +102,58 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable( return NewRC; } +/// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a +/// preference to use \p PhysReg first. Returns false if the reassignments +/// cannot be trivially performed. +bool AMDGPURewriteAGPRCopyMFMAImpl::attemptReassignmentsToAGPR( + SmallSetVector<Register, 4> &InterferingRegs, MCPhysReg PrefPhysReg) const { + // FIXME: The ordering may matter here, but we're just taking uselistorder + // with the special case of ensuring to process the starting instruction + // first. 
We probably should extract the priority advisor out of greedy and + use that ordering. + for (Register InterferingReg : InterferingRegs) { + LiveInterval &ReassignLI = LIS.getInterval(InterferingReg); + const TargetRegisterClass *EquivalentAGPRRegClass = + TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); + + MCPhysReg Assignable = AMDGPU::NoRegister; + if (EquivalentAGPRRegClass->contains(PrefPhysReg) && + LRM.checkInterference(ReassignLI, PrefPhysReg) == + LiveRegMatrix::IK_Free) { + // First try to assign to the AGPR we were already copying to. This + // should be the first assignment we attempt. We have to guard + // against the use being a subregister (which doesn't have an exact + // class match). + + // TODO: If this does happen to be a subregister use, we should + // still try to assign to a subregister of the original copy result. + Assignable = PrefPhysReg; + } else { + ArrayRef<MCPhysReg> AllocOrder = + RegClassInfo.getOrder(EquivalentAGPRRegClass); + for (MCPhysReg Reg : AllocOrder) { + if (LRM.checkInterference(ReassignLI, Reg) == LiveRegMatrix::IK_Free) { + Assignable = Reg; + break; + } + } + } + + if (!Assignable) { + LLVM_DEBUG(dbgs() << "Unable to reassign VGPR " + << printReg(InterferingReg, &TRI) + << " to a free AGPR\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Reassigning VGPR " << printReg(InterferingReg, &TRI) + << " to " << printReg(Assignable, &TRI) << '\n'); + LRM.assign(ReassignLI, Assignable); + } + + return true; +} + bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // This only applies on subtargets that have a configurable AGPR vs. VGPR // allocation. 
@@ -127,7 +183,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { LiveInterval &LI = LIS.getInterval(VReg); - // TODO: Test multiple uses for (VNInfo *VNI : LI.vnis()) { MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); @@ -136,22 +191,27 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { if (!DefMI || !DefMI->isFullCopy()) continue; - Register CopySrcReg = DefMI->getOperand(1).getReg(); - if (!CopySrcReg.isVirtual()) + Register MFMADstReg = DefMI->getOperand(1).getReg(); + if (!MFMADstReg.isVirtual()) continue; - LiveInterval &CopySrcLI = LIS.getInterval(CopySrcReg); + LiveInterval &CopySrcLI = LIS.getInterval(MFMADstReg); LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot()); - MachineInstr *CopySrcMI = LIS.getInstructionFromIndex(LRQ.valueIn()->def); - if (!CopySrcMI) + MachineInstr *MFMA = LIS.getInstructionFromIndex(LRQ.valueIn()->def); + if (!MFMA) continue; - int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(CopySrcMI->getOpcode()); + int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(MFMA->getOpcode()); if (AGPROp == -1) continue; - MachineOperand *Src2 = - TII.getNamedOperand(*CopySrcMI, AMDGPU::OpName::src2); + MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2); + if (!Src2->isReg()) + continue; + + Register Src2Reg = Src2->getReg(); + if (!Src2Reg.isVirtual()) + continue; // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead // of an AGPR or VGPR subclass, so we can't simply use the result on the @@ -163,28 +223,22 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { << " Dst=[" << printReg(VReg) << " => " << printReg(PhysReg, &TRI) << "], Src2=[" << printReg(Src2->getReg(), &TRI) << " => " - << printReg(Src2PhysReg, &TRI) << "]: " << *CopySrcMI; + << printReg(Src2PhysReg, &TRI) << "]: " << *MFMA; }); - // If the inputs are tied and the same register, we can shortcut and - // directly replace the register. 
- if (!Src2->isReg() || Src2->getReg() != CopySrcReg || - Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { - LLVM_DEBUG( - dbgs() - << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); - // TODO: Only handles the tied case for now. If the input operand is a - // different register, we need to also reassign it (either by looking - // for a compatible copy-from-AGPR, or by seeing if an available AGPR is - // compatible with all other uses. + const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(Src2->getReg()); + const TargetRegisterClass *NewDstConstraintRC = + TII.getRegClass(TII.get(AGPROp), 0, &TRI, MF); + const TargetRegisterClass *NewSrc2ConstraintRC = NewDstConstraintRC; - // If we can't reassign it, we'd need to introduce a different copy - // which is likely worse than the copy we'd be saving. - continue; - } + assert(NewSrc2ConstraintRC == TII.getRegClass(TII.get(AGPROp), + Src2->getOperandNo(), &TRI, + MF) && + "expected src2 and dst to have same class constraint"); - const TargetRegisterClass *Src2VirtRegRC = - MRI.getRegClass(Src2->getReg()); + const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2Reg); + + SmallVector<MachineInstr *, 4> DstRewriteCandidates; // We've found av = COPY (MFMA), and need to verify that we can trivially // rewrite src2 to use the new AGPR. If we can't trivially replace it, @@ -192,35 +246,129 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // first place, as well as need to assign another register, and need to // figure out where to put them. The live range splitting is smarter than // anything we're doing here, so trust it did something reasonable. 
- const TargetRegisterClass *Src2ExceptRC = - recomputeRegClassExceptRewritable(Src2->getReg(), Src2VirtRegRC, - VirtRegRC); - if (!Src2ExceptRC) { - LLVM_DEBUG(dbgs() << "Could not recompute the regclass\n"); + const TargetRegisterClass *DstExceptRC = + recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC, + DstRewriteCandidates); + if (!DstExceptRC) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass of " + << printReg(MFMADstReg, &TRI) << '\n'); continue; } - const TargetRegisterClass *NewSrc2ConstraintRC = - TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF); - // Try to constrain src2 to the replacement instruction candidate's // register class. - const TargetRegisterClass *NewSrc2RC = - TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); - if (!NewSrc2RC) { - LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI) + const TargetRegisterClass *NewDstRC = + TRI.getCommonSubClass(DstExceptRC, NewDstConstraintRC); + if (!NewDstRC) { + LLVM_DEBUG(dbgs() << "Other uses of " << printReg(MFMADstReg, &TRI) << " are incompatible with replacement class\n"); continue; } + // If the inputs are tied and the same register, we can shortcut and + // directly replace the register. + if (Src2->getReg() != MFMADstReg || + Src2->getSubReg() != DefMI->getOperand(1).getSubReg()) { + // If src2 and dst are different registers, we need to also reassign the + // input to an available AGPR if it is compatible with all other uses. + // + // If we can't reassign it, we'd need to introduce a different copy + // which is likely worse than the copy we'd be saving. 
+ SmallVector<MachineInstr *, 4> Src2RewriteCandidates; + const TargetRegisterClass *Src2ExceptRC = + recomputeRegClassExceptRewritable(Src2Reg, Src2RC, VirtRegRC, + Src2RewriteCandidates); + if (!Src2ExceptRC) { + LLVM_DEBUG(dbgs() << "Could not recompute the regclass of " + << printReg(Src2Reg, &TRI) << '\n'); + continue; + } + + const TargetRegisterClass *NewSrc2RC = + TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); + if (!NewSrc2RC) { + LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2Reg, &TRI) + << " are incompatible with AGPR replacement\n"); + continue; + } + + // It's likely that the MFMA is used in sequence with other MFMAs; if we + // cannot migrate the full use/def chain of MFMAs, we would need to + // introduce intermediate copies somewhere. So we only make the + // transform if all the interfering MFMAs can also be migrated. Collect + // the set of rewritable MFMAs and check if we can assign an AGPR at + // that point. + // + // If any of the MFMAs aren't reassignable, we give up and rollback to + // the original register assignments. + + using RecoloringStack = + SmallVector<std::pair<LiveInterval *, MCRegister>, 8>; + + SmallSetVector<Register, 4> InterferingRegs; + + // Make sure we reassign the MFMA we found the copy from first. We want + // to ensure dst ends up in the physreg we were originally copying to. 
+ InterferingRegs.insert(MFMADstReg); + + RecoloringStack TentativeReassignments; + + for (MachineInstr *RewriteCandidate : Src2RewriteCandidates) { + MachineOperand *CandDst = + TII.getNamedOperand(*RewriteCandidate, AMDGPU::OpName::vdst); + MachineOperand *CandSrc2 = + TII.getNamedOperand(*RewriteCandidate, AMDGPU::OpName::src2); + + InterferingRegs.insert(CandDst->getReg()); + if (CandDst->getReg() != CandSrc2->getReg()) + InterferingRegs.insert(CandSrc2->getReg()); + } + + for (Register InterferingReg : InterferingRegs) { + LiveInterval &LI = LIS.getInterval(InterferingReg); + TentativeReassignments.push_back({&LI, VRM.getPhys(InterferingReg)}); + LRM.unassign(LI); + } + + if (!attemptReassignmentsToAGPR(InterferingRegs, PhysReg)) { + // Roll back the register assignments to the original state. + for (auto [LI, OldAssign] : TentativeReassignments) { + if (VRM.hasPhys(LI->reg())) + LRM.unassign(*LI); + LRM.assign(*LI, OldAssign); + } + + continue; + } + + // Fixup the register classes of the virtual registers now that we've + // committed to the reassignments. + for (Register InterferingReg : InterferingRegs) { + const TargetRegisterClass *EquivalentAGPRRegClass = + TRI.getEquivalentAGPRClass(MRI.getRegClass(InterferingReg)); + MRI.setRegClass(InterferingReg, EquivalentAGPRRegClass); + } + + for (MachineInstr *RewriteCandidate : Src2RewriteCandidates) { + int NewMFMAOp = + AMDGPU::getMFMASrcCVDstAGPROp(RewriteCandidate->getOpcode()); + RewriteCandidate->setDesc(TII.get(NewMFMAOp)); + } + + // We likely left an identity copy behind after assignment; let + // VirtRegRewriter deal with it later. + MadeChange = true; + continue; + } + MRI.setRegClass(VReg, AssignedRC); - MRI.setRegClass(Src2->getReg(), NewSrc2RC); + MRI.setRegClass(MFMADstReg, NewDstRC); - CopySrcMI->setDesc(TII.get(AGPROp)); + MFMA->setDesc(TII.get(AGPROp)); // Perform replacement of the register, rewriting the rewritable uses. 
for (MachineInstr &UseMI : - make_early_inc_range(MRI.reg_instructions(CopySrcReg))) { + make_early_inc_range(MRI.reg_instructions(MFMADstReg))) { if (TII.isMAI(UseMI)) { // Note the register we need to rewrite may still appear in src0/src1, // but that's fine since those can use A or V anyway. @@ -229,10 +377,10 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { UseMI.setDesc(TII.get(ReplacementOp)); } - UseMI.substituteRegister(CopySrcReg, VReg, AMDGPU::NoSubRegister, TRI); + UseMI.substituteRegister(MFMADstReg, VReg, AMDGPU::NoSubRegister, TRI); } - LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI); + LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *MFMA); // We left behind an identity copy, so delete it. LIS.RemoveMachineInstrFromMaps(*DefMI); @@ -243,7 +391,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // We don't need the liveness information anymore, so don't bother // updating the intervals. Just delete the stale information. // TODO: Is it worth preserving these? 
- LIS.removeInterval(CopySrcReg); + LIS.removeInterval(MFMADstReg); LIS.removeInterval(VReg); LIS.createAndComputeVirtRegInterval(VReg); @@ -257,6 +405,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass { public: static char ID; + RegisterClassInfo RegClassInfo; AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) { initializeAMDGPURewriteAGPRCopyMFMALegacyPass( @@ -302,11 +451,13 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction( if (skipFunction(MF.getFunction())) return false; + RegClassInfo.runOnMachineFunction(MF); + auto &VRM = getAnalysis().getVRM(); auto &LRM = getAnalysis().getLRM(); auto &LIS = getAnalysis().getLIS(); - AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo); return Impl.run(MF); } @@ -316,8 +467,10 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, VirtRegMap &VRM = MFAM.getResult(MF); LiveRegMatrix &LRM = MFAM.getResult(MF); LiveIntervals &LIS = MFAM.getResult(MF); + RegisterClassInfo RegClassInfo; + RegClassInfo.runOnMachineFunction(MF); - AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, RegClassInfo); if (!Impl.run(MF)) return PreservedAnalyses::all(); auto PA = getMachineFunctionPassPreservedAnalyses(); diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index c3f6af3854eeb..8cce41360ab82 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -209,15 +209,14 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr2_vgpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = 
GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 @@ -470,18 +469,17 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), 
%bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr16_vgpr17 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: 
$vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -557,18 +555,17 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; 
CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -645,16 +642,15 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr4_vgpr5, $vgpr8_vgpr9, undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr4_vgpr5, $vgpr8_vgpr9, undef $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; 
CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 @@ -731,16 +727,15 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr18_vgpr19 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_16X16X16F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr16_agpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec + ; 
CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_16X16X16F16_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, killed $agpr2_agpr3_agpr4_agpr5, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23