diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 71dd99c0d7a53..2111b8a8763d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -538,6 +538,18 @@ extern char &GCNRewritePartialRegUsesID; void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &); extern char &AMDGPUWaitSGPRHazardsLegacyID; +/// New pass manager pass, registered as "amdgpu-rewrite-agpr-copy-mfma". +class AMDGPURewriteAGPRCopyMFMAPass + : public PassInfoMixin<AMDGPURewriteAGPRCopyMFMAPass> { +public: + AMDGPURewriteAGPRCopyMFMAPass() = default; + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +void initializeAMDGPURewriteAGPRCopyMFMALegacyPass(PassRegistry &); +extern char &AMDGPURewriteAGPRCopyMFMALegacyID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 13453963eec6d..b61216c5e5e92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -102,6 +102,7 @@ MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) MACHINE_FUNCTION_PASS("amdgpu-mark-last-scratch-load", AMDGPUMarkLastScratchLoadPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass()) MACHINE_FUNCTION_PASS("amdgpu-reserve-wwm-regs", AMDGPUReserveWWMRegsPass()) +MACHINE_FUNCTION_PASS("amdgpu-rewrite-agpr-copy-mfma", AMDGPURewriteAGPRCopyMFMAPass()) MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp new file mode 100644 index 0000000000000..a8e1967116d19 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -0,0 +1,304 @@ +//===-- AMDGPURewriteAGPRCopyMFMA.cpp 
-------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file \brief Try to replace MFMA instructions using VGPRs with MFMA +/// instructions using AGPRs. We expect MFMAs to be selected using VGPRs, and +/// only use AGPRs if it helps avoid spilling. In this case, the MFMA will have +/// copies between AGPRs and VGPRs and the AGPR variant of an MFMA pseudo. This +/// pass will attempt to delete the cross register bank copy and replace the +/// MFMA opcode. +/// +/// TODO: +/// - Handle non-tied dst+src2 cases. We need to try to find a copy from an +/// AGPR from src2, or reassign src2 to an available AGPR (which should work +/// in the common case of a load). +/// +/// - Handle multiple MFMA uses of the same register. e.g. 
chained MFMAs that +/// can be rewritten as a set +/// +/// - Update LiveIntervals incrementally instead of recomputing from scratch +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-rewrite-agpr-copy-mfma" + +namespace { + +class AMDGPURewriteAGPRCopyMFMAImpl { + const GCNSubtarget &ST; + const SIInstrInfo &TII; + const SIRegisterInfo &TRI; + MachineRegisterInfo &MRI; + VirtRegMap &VRM; + LiveRegMatrix &LRM; + LiveIntervals &LIS; + +public: + AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM, + LiveRegMatrix &LRM, LiveIntervals &LIS) + : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), + TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM), + LIS(LIS) {} + + /// Compute the register class constraints based on the uses of \p Reg, + /// excluding uses from \p ExceptMI. This should be nearly identical to + /// MachineRegisterInfo::recomputeRegClass. + const TargetRegisterClass * + recomputeRegClassExcept(Register Reg, const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC, + const MachineInstr *ExceptMI) const; + + bool run(MachineFunction &MF) const; +}; + +const TargetRegisterClass * +AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExcept( + Register Reg, const TargetRegisterClass *OldRC, + const TargetRegisterClass *NewRC, const MachineInstr *ExceptMI) const { + + // Accumulate constraints from all uses. + for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) { + // Apply the effect of the given operand to NewRC. 
+ MachineInstr *MI = MO.getParent(); + if (MI == ExceptMI) + continue; + + unsigned OpNo = &MO - &MI->getOperand(0); + NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI); + if (!NewRC || NewRC == OldRC) + return nullptr; + } + + return NewRC; +} + +bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { + // This only applies on subtargets that have a configurable AGPR vs. VGPR + // allocation. + if (!ST.hasGFX90AInsts()) + return false; + + // Early exit if no AGPRs were assigned. + if (!LRM.isPhysRegUsed(AMDGPU::AGPR0)) + return false; + + bool MadeChange = false; + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + Register VReg = Register::index2VirtReg(I); + Register PhysReg = VRM.getPhys(VReg); + if (!PhysReg) + continue; + + // Find AV_* registers assigned to AGPRs. + const TargetRegisterClass *VirtRegRC = MRI.getRegClass(VReg); + if (!TRI.isVectorSuperClass(VirtRegRC)) + continue; + + const TargetRegisterClass *AssignedRC = TRI.getPhysRegBaseClass(PhysReg); + if (!TRI.isAGPRClass(AssignedRC)) + continue; + + LiveInterval &LI = LIS.getInterval(VReg); + + // TODO: Test multiple uses + for (VNInfo *VNI : LI.vnis()) { + MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); + + // TODO: Handle SplitKit produced copy bundles for partially defined + // registers. 
+ if (!DefMI || !DefMI->isFullCopy()) + continue; + + Register CopySrcReg = DefMI->getOperand(1).getReg(); + if (!CopySrcReg.isVirtual()) + continue; + + LiveInterval &CopySrcLI = LIS.getInterval(CopySrcReg); + LiveQueryResult LRQ = CopySrcLI.Query(VNI->def.getRegSlot()); + + // No value is live into the copy if its source is undefined there, so + // there is no defining instruction to inspect. + const VNInfo *CopySrcVNI = LRQ.valueIn(); + if (!CopySrcVNI) + continue; + + MachineInstr *CopySrcMI = LIS.getInstructionFromIndex(CopySrcVNI->def); + if (!CopySrcMI) + continue; + + int AGPROp = AMDGPU::getMFMASrcCVDstAGPROp(CopySrcMI->getOpcode()); + if (AGPROp == -1) + continue; + + MachineOperand *Src2 = + TII.getNamedOperand(*CopySrcMI, AMDGPU::OpName::src2); + + // getReg() is only valid on register operands. A non-register src2 + // (presumably an inline constant) cannot be the tied dst+src2 form + // handled below, so skip it. + if (!Src2 || !Src2->isReg()) + continue; + + // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead + // of an AGPR or VGPR subclass, so we can't simply use the result on the + // assignment. + + LLVM_DEBUG({ + Register Src2PhysReg = VRM.getPhys(Src2->getReg()); + dbgs() << "Attempting to replace VGPR MFMA with AGPR version:" + << " Dst=[" << printReg(VReg) << " => " + << printReg(PhysReg, &TRI) << "], Src2=[" + << printReg(Src2->getReg(), &TRI) << " => " + << printReg(Src2PhysReg, &TRI) << "]: " << *CopySrcMI; + }); + + // If the inputs are tied and the same register, we can shortcut and + // directly replace the register. + if (Src2->getReg() != CopySrcReg) { + LLVM_DEBUG( + dbgs() + << "Replacing untied VGPR MFMAs with AGPR form not yet handled\n"); + // TODO: Only handles the tied case for now. If the input operand is a + // different register, we need to also reassign it (either by looking + // for a compatible copy-from-AGPR, or by seeing if an available AGPR is + // compatible with all other uses). + + // If we can't reassign it, we'd need to introduce a different copy + // which is likely worse than the copy we'd be saving. + continue; + } + + const TargetRegisterClass *Src2VirtRegRC = + MRI.getRegClass(Src2->getReg()); + + // We've found av = COPY (MFMA), and need to verify that we can trivially + // rewrite src2 to use the new AGPR. 
If we can't trivially replace it, + // we're going to induce as many copies as we would have emitted in the + // first place, as well as need to assign another register, and need to + // figure out where to put them. The live range splitting is smarter than + // anything we're doing here, so trust it did something reasonable. + const TargetRegisterClass *Src2ExceptRC = recomputeRegClassExcept( + Src2->getReg(), Src2VirtRegRC, VirtRegRC, CopySrcMI); + if (!Src2ExceptRC) + continue; + + const TargetRegisterClass *NewSrc2ConstraintRC = + TII.getRegClass(TII.get(AGPROp), Src2->getOperandNo(), &TRI, MF); + + // Try to constrain src2 to the replacement instruction candidate's + // register class. + const TargetRegisterClass *NewSrc2RC = + TRI.getCommonSubClass(Src2ExceptRC, NewSrc2ConstraintRC); + if (!NewSrc2RC) { + // TODO: This is ignoring other rewritable uses. e.g. a rewritable MFMA + // using a rewritable MFMA can be rewritten as a pair. + LLVM_DEBUG(dbgs() << "Other uses of " << printReg(Src2->getReg(), &TRI) + << " are incompatible with replacement class\n"); + continue; + } + + // Replacing CopySrcReg with VReg moves the whole live range of the copy + // source onto PhysReg, so PhysReg must be free everywhere the source is + // live, not just after the copy. + if (LRM.checkInterference(CopySrcLI, PhysReg) != LiveRegMatrix::IK_Free) { + LLVM_DEBUG(dbgs() << "Cannot move " << printReg(CopySrcReg, &TRI) + << " to " << printReg(PhysReg, &TRI) + << " due to interference\n"); + continue; + } + + MRI.setRegClass(VReg, AssignedRC); + MRI.setRegClass(Src2->getReg(), NewSrc2RC); + + CopySrcMI->setDesc(TII.get(AGPROp)); + + // TODO: Is replacing too aggressive, fixup these instructions only? + MRI.replaceRegWith(CopySrcReg, VReg); + + LLVM_DEBUG(dbgs() << "Replaced VGPR MFMA with AGPR: " << *CopySrcMI); + + // We left behind an identity copy, so delete it. + LIS.RemoveMachineInstrFromMaps(*DefMI); + DefMI->eraseFromParent(); + + LRM.unassign(CopySrcLI); + + // The matrix stores pointers to the intervals it was given, so VReg must + // also be unassigned before its interval is destroyed below. + LRM.unassign(LI); + + // We don't need the liveness information anymore, so don't bother + // updating the intervals. Just delete the stale information. + // TODO: Is it worth preserving these? 
+ LIS.removeInterval(CopySrcReg); + LIS.removeInterval(VReg); + + // Register the recomputed interval again so the matrix and the VirtRegMap + // describe the extended live range, and drop any cached queries. + LRM.assign(LIS.createAndComputeVirtRegInterval(VReg), PhysReg); + LRM.invalidateVirtRegs(); + + MadeChange = true; + + // LI and the value numbers being iterated were destroyed by + // removeInterval(VReg) above, so stop walking them. + break; + } + } + + return MadeChange; +} + +class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPURewriteAGPRCopyMFMALegacy() : MachineFunctionPass(ID) { + initializeAMDGPURewriteAGPRCopyMFMALegacyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "AMDGPU Rewrite AGPR-Copy-MFMA"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervalsWrapperPass>(); + AU.addRequired<VirtRegMapWrapperLegacy>(); + AU.addRequired<LiveRegMatrixWrapperLegacy>(); + + AU.addPreserved<LiveIntervalsWrapperPass>(); + AU.addPreserved<VirtRegMapWrapperLegacy>(); + AU.addPreserved<LiveRegMatrixWrapperLegacy>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE, + "AMDGPU Rewrite AGPR-Copy-MFMA", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) +INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE, + "AMDGPU Rewrite AGPR-Copy-MFMA", false, false) + +char AMDGPURewriteAGPRCopyMFMALegacy::ID = 0; + +char &llvm::AMDGPURewriteAGPRCopyMFMALegacyID = + AMDGPURewriteAGPRCopyMFMALegacy::ID; + +bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction( + MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + auto &VRM = getAnalysis<VirtRegMapWrapperLegacy>().getVRM(); + auto &LRM = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM(); + auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); + + AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); + return Impl.run(MF); +} + +PreservedAnalyses +AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + VirtRegMap &VRM = MFAM.getResult<VirtRegMapAnalysis>(MF); + LiveRegMatrix &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF); + LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF); + + 
AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS); + if (!Impl.run(MF)) + return PreservedAnalyses::all(); + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index d2e4825cf3c81..c3536113e9bef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -534,6 +534,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerModuleLDSLegacyPass(*PR); initializeAMDGPULowerBufferFatPointersPass(*PR); initializeAMDGPUReserveWWMRegsLegacyPass(*PR); + initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPURewriteUndefForPHILegacyPass(*PR); initializeSIAnnotateControlFlowLegacyPass(*PR); @@ -1577,6 +1578,8 @@ void GCNPassConfig::addOptimizedRegAlloc() { bool GCNPassConfig::addPreRewrite() { if (EnableRegReassign) addPass(&GCNNSAReassignID); + + addPass(&AMDGPURewriteAGPRCopyMFMALegacyID); return true; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 928a5001e0c98..e3519f192137c 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -101,6 +101,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPURemoveIncompatibleFunctions.cpp AMDGPUReserveWWMRegs.cpp AMDGPUResourceUsageAnalysis.cpp + AMDGPURewriteAGPRCopyMFMA.cpp AMDGPURewriteOutArguments.cpp AMDGPURewriteUndefForPHI.cpp AMDGPUSelectionDAGInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index a4b135d5e0b59..06a7a17b0246b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -237,6 +237,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { return isSGPRClass(getPhysRegBaseClass(Reg)); } + bool isVGPRPhysReg(Register Reg) 
const { + return isVGPRClass(getPhysRegBaseClass(Reg)); + } + /// \returns true if this class contains only VGPR registers static bool isVGPRClass(const TargetRegisterClass *RC) { return hasVGPRs(RC) && !hasAGPRs(RC) && !hasSGPRs(RC); diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index 171df9343b05d..665d63476372f 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -30,15 +30,14 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: 
$vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 @@ -117,19 +116,18 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vcc, $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: renamable $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 8, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8 = GLOBAL_LOAD_DWORDX3 undef renamable $vgpr0_vgpr1, 16, 0, implicit $exec :: (load (s96), align 16, addrspace 1) - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 28, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 - ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR undef renamable $vgpr0, renamable 
$vgpr6_vgpr7_vgpr8_vgpr9, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: renamable $agpr0_agpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $agpr2_agpr3 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 8, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: renamable $agpr4_agpr5_agpr6 = GLOBAL_LOAD_DWORDX3 undef renamable $vgpr0_vgpr1, 16, 0, implicit $exec :: (load (s96), align 16, addrspace 1) + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 28, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR undef renamable $vgpr0, renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17:0x00000000FFFFFFFF + ; CHECK-NEXT: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15:0x00000000FFFFFFFF ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; 
CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index dd2ff2e013cc8..af3241e95e91d 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -376,6 +376,7 @@ ; GCN-O1-NEXT: Live Register Matrix ; GCN-O1-NEXT: Greedy Register Allocator ; GCN-O1-NEXT: GCN NSA Reassign +; GCN-O1-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; GCN-O1-NEXT: Virtual Register Rewriter ; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load ; GCN-O1-NEXT: Stack Slot Coloring @@ -686,6 +687,7 @@ ; GCN-O1-OPTS-NEXT: Live Register Matrix ; GCN-O1-OPTS-NEXT: Greedy Register Allocator ; GCN-O1-OPTS-NEXT: GCN NSA Reassign +; GCN-O1-OPTS-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter ; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load ; GCN-O1-OPTS-NEXT: Stack Slot Coloring @@ -1002,6 +1004,7 @@ ; GCN-O2-NEXT: Live Register Matrix ; GCN-O2-NEXT: Greedy Register Allocator ; GCN-O2-NEXT: GCN NSA Reassign +; GCN-O2-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; GCN-O2-NEXT: Virtual Register Rewriter ; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load ; GCN-O2-NEXT: Stack Slot Coloring @@ -1331,6 +1334,7 @@ ; GCN-O3-NEXT: Live Register Matrix ; GCN-O3-NEXT: Greedy Register Allocator ; GCN-O3-NEXT: GCN NSA Reassign +; GCN-O3-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; GCN-O3-NEXT: Virtual Register Rewriter ; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load ; GCN-O3-NEXT: Stack Slot Coloring diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll index 33585024d81dd..ea6449b99b516 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -31,6 +31,7 @@ ; DEFAULT-NEXT: Live Register Matrix ; DEFAULT-NEXT: Greedy Register Allocator ; 
DEFAULT-NEXT: GCN NSA Reassign +; DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; DEFAULT-NEXT: Virtual Register Rewriter ; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load ; DEFAULT-NEXT: Stack Slot Coloring @@ -77,6 +78,7 @@ ; BASIC-DEFAULT-NEXT: Live Register Matrix ; BASIC-DEFAULT-NEXT: Greedy Register Allocator ; BASIC-DEFAULT-NEXT: GCN NSA Reassign +; BASIC-DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter ; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load ; BASIC-DEFAULT-NEXT: Stack Slot Coloring @@ -99,6 +101,7 @@ ; DEFAULT-BASIC-NEXT: Live Register Matrix ; DEFAULT-BASIC-NEXT: Basic Register Allocator ; DEFAULT-BASIC-NEXT: GCN NSA Reassign +; DEFAULT-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter ; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load ; DEFAULT-BASIC-NEXT: Stack Slot Coloring @@ -127,6 +130,7 @@ ; BASIC-BASIC-NEXT: Live Register Matrix ; BASIC-BASIC-NEXT: Basic Register Allocator ; BASIC-BASIC-NEXT: GCN NSA Reassign +; BASIC-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA ; BASIC-BASIC-NEXT: Virtual Register Rewriter ; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load ; BASIC-BASIC-NEXT: Stack Slot Coloring diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 628e544a687e5..d4adeddd9b4e4 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -188,6 +188,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPURemoveIncompatibleFunctions.cpp", "AMDGPUReserveWWMRegs.cpp", "AMDGPUResourceUsageAnalysis.cpp", + "AMDGPURewriteAGPRCopyMFMA.cpp", "AMDGPURewriteOutArguments.cpp", "AMDGPURewriteUndefForPHI.cpp", "AMDGPUSelectionDAGInfo.cpp",