diff --git a/llvm/lib/Target/AIE/AIE.h b/llvm/lib/Target/AIE/AIE.h index 622acd89b7ae..39138cf1b82a 100644 --- a/llvm/lib/Target/AIE/AIE.h +++ b/llvm/lib/Target/AIE/AIE.h @@ -60,6 +60,7 @@ MachineFunctionPass *createAIEEliminateDuplicatePHI(); FunctionPass *createAIEOutlineMemoryGEP(); FunctionPass *createAIESuperRegRewriter(); FunctionPass *createAIEWawRegRewriter(); +FunctionPass *createAIEUnallocatedSuperRegRewriter(); FunctionPass *createAIEPostSelectOptimize(); MachineFunctionPass * createDeadMachineInstructionElim(bool KeepLifetimeInstructions); @@ -84,6 +85,8 @@ extern char &AIESuperRegRewriterID; void initializeAIESuperRegRewriterPass(PassRegistry &); extern char &AIEWawRegRewriterID; void initializeAIEWawRegRewriterPass(PassRegistry &); +extern char &AIEUnallocatedSuperRegRewriterID; +void initializeAIEUnallocatedSuperRegRewriterPass(PassRegistry &); extern char &AIEOutlineMemoryGEPID; void initializeAIEOutlineMemoryGEPPass(PassRegistry &); diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp index 1e2d1f19110c..0b572e3a14fc 100644 --- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp @@ -96,6 +96,11 @@ cl::opt EnableStagedRA("aie-staged-ra", cl::Hidden, cl::init(true), cl::desc("Enable multi-stage register allocation")); +cl::opt EnableFineGrainedStagedRA( + "aie-staged-ra-fine-grained-alloc", cl::Hidden, cl::init(true), + cl::desc("Enable multi-stage register allocation with fine-grained " + "selection of live intervals")); + cl::opt EnableWAWRegRewrite("aie-wawreg-rewrite", cl::desc("Enable the WAW Register Renaming in loops"), @@ -153,6 +158,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAIETarget() { initializeAIEPseudoBranchExpansionPass(*PR); initializeAIESubRegConstrainerPass(*PR); initializeAIESuperRegRewriterPass(*PR); + initializeAIEUnallocatedSuperRegRewriterPass(*PR); initializeAIEWawRegRewriterPass(*PR); initializeAIEOutlineMemoryGEPPass(*PR); initializeAIEFinalizeBundlePass(*PR); diff --git a/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp b/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp index 433aef018cbe..44e84037df91 100644 --- a/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp +++ b/llvm/lib/Target/AIE/AIESuperRegRewriter.cpp @@ -4,12 +4,13 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// #include "AIEBaseInstrInfo.h" #include "AIEBaseRegisterInfo.h" +#include "AIESuperRegUtils.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallSet.h" @@ -63,80 +64,8 @@ class AIESuperRegRewriter : public MachineFunctionPass { } bool runOnMachineFunction(MachineFunction &Fn) override; - -private: - void rewriteSuperReg(Register Reg, Register AssignedPhysReg, - MachineRegisterInfo &MRI, const AIEBaseRegisterInfo &TRI, - VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS, - SlotIndexes &Indexes, LiveDebugVariables &DebugVars); }; -/// Returns the subreg indices that can be used to rewrite \p Reg into smaller -/// regs. Returns {} if the rewrite isn't possible. 
-static SmallSet getRewritableSubRegs(Register Reg, - const MachineRegisterInfo &MRI, - const AIEBaseRegisterInfo &TRI, - std::set &VisitedVRegs) { - if (Reg.isPhysical()) { - // TODO: One could use collectSubRegs() in AIEBaseInstrInfo.cpp - // But given that MOD registers are not part of the ABI, they should - // not appear as physical registers before RA. - LLVM_DEBUG(dbgs() << " Cannot rewrite physreg " << printReg(Reg, &TRI) - << "\n"); - return {}; - } - - auto &SubRegSplit = TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()); - if (SubRegSplit.size() <= 1) { - // Register does not have multiple subregs to be rewritten into. - LLVM_DEBUG(dbgs() << " Cannot rewrite " << printReg(Reg, &TRI, 0, &MRI) - << ": no sub-reg split\n"); - return {}; - } - - VisitedVRegs.insert(Reg); - SmallSet UsedSubRegs; - for (MachineOperand &RegOp : MRI.reg_operands(Reg)) { - int SubReg = RegOp.getSubReg(); - if (SubReg && SubRegSplit.count(SubReg)) { - UsedSubRegs.insert(SubReg); - } else if (RegOp.getParent()->isFullCopy()) { - // To rewrite a full copy, both operands need to be rewritable using - // their subregs. - Register DstReg = RegOp.getParent()->getOperand(0).getReg(); - if (!VisitedVRegs.count(DstReg) && - getRewritableSubRegs(DstReg, MRI, TRI, VisitedVRegs).empty()) { - LLVM_DEBUG(dbgs() << " Cannot rewrite " - << printReg(DstReg, &TRI, 0, &MRI) << " in " - << *RegOp.getParent()); - return {}; - } - Register SrcReg = RegOp.getParent()->getOperand(1).getReg(); - if (!VisitedVRegs.count(SrcReg) && - getRewritableSubRegs(SrcReg, MRI, TRI, VisitedVRegs).empty()) { - LLVM_DEBUG(dbgs() << " Cannot rewrite " - << printReg(SrcReg, &TRI, 0, &MRI) << " in " - << *RegOp.getParent()); - return {}; - } - UsedSubRegs.insert(SubRegSplit.begin(), SubRegSplit.end()); - } else { - LLVM_DEBUG(dbgs() << " Cannot rewrite " << RegOp << " in " - << *RegOp.getParent()); - return {}; - } - } - - return UsedSubRegs; -} - -static SmallSet getRewritableSubRegs(Register Reg, - const MachineRegisterInfo &MRI, - const AIEBaseRegisterInfo &TRI) { - std::set VisitedVRegs; - return getRewritableSubRegs(Reg, MRI, TRI, VisitedVRegs); -} - bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(llvm::dbgs() << "*** Splitting super-registers: " << MF.getName() << " ***\n"); @@ -149,10 +78,11 @@ bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { LiveIntervals &LIS = getAnalysis().getLIS(); SlotIndexes &Indexes = getAnalysis().getSI(); LiveDebugVariables &DebugVars = getAnalysis().getLDV(); - std::map AssignedPhysRegs; + std::map>> AssignedPhysRegs; // Collect already-assigned VRegs that can be split into smaller ones. 
LLVM_DEBUG(VRM.dump()); + LLVM_DEBUG(LIS.dump()); for (unsigned VRegIdx = 0, End = MRI.getNumVirtRegs(); VRegIdx != End; ++VRegIdx) { Register Reg = Register::index2VirtReg(VRegIdx); @@ -172,8 +102,11 @@ bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Analysing " << printReg(Reg, &TRI, 0, &MRI) << ":" << printRegClassOrBank(Reg, MRI, &TRI) << '\n'); - if (!getRewritableSubRegs(Reg, MRI, TRI).empty()) { - AssignedPhysRegs[Reg] = VRM.getPhys(Reg); + SmallSet RewritableSubRegs = + AIESuperRegUtils::getRewritableSubRegs(Reg, MRI, TRI); + if (!RewritableSubRegs.empty()) { + AssignedPhysRegs[Reg] = + std::make_pair(VRM.getPhys(Reg), RewritableSubRegs); LRM.unassign(LIS.getInterval(Reg)); } else { LLVM_DEBUG(dbgs() << "Could not rewrite " << printReg(Reg, &TRI, 0, &MRI) @@ -182,138 +115,17 @@ bool AIESuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { } // Re-write all the collected VRegs - for (auto &[VReg, PhysReg] : AssignedPhysRegs) { - rewriteSuperReg(VReg, PhysReg, MRI, TRI, VRM, LRM, LIS, Indexes, DebugVars); + for (auto &[VReg, PhysRegAndSubRegs] : AssignedPhysRegs) { + const Register PhysReg = PhysRegAndSubRegs.first; + SmallSet &SubRegs = PhysRegAndSubRegs.second; + AIESuperRegUtils::rewriteSuperReg(VReg, PhysReg, SubRegs, MRI, TRI, VRM, + LRM, LIS, Indexes, DebugVars); } LLVM_DEBUG(VRM.dump()); return !AssignedPhysRegs.empty(); } -/// Return a mask of all the lanes that are live at \p Index -static LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, - const LiveIntervals &LIS) { - const LiveInterval &LI = LIS.getInterval(Reg); - if (!LI.hasSubRanges()) - return LaneBitmask::getAll(); - - LaneBitmask LiveLanes; - for (const LiveInterval::SubRange &SubLI : LI.subranges()) { - if (SubLI.liveAt(Index)) - LiveLanes |= SubLI.LaneMask; - } - return LiveLanes; -} - -/// Rewrite a full copy into multiple copies using the subregs in \p CopySubRegs -static void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, - LiveIntervals &LIS, const TargetInstrInfo &TII, - const TargetRegisterInfo &TRI) { - assert(MI.isFullCopy()); - SlotIndex CopyIndex = LIS.getInstructionIndex(MI); - LLVM_DEBUG(dbgs() << " Changing full copy at " << CopyIndex << ": " << MI); - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LaneBitmask LiveSrcLanes = getLiveLanesAt(CopyIndex, SrcReg, LIS); - - LIS.removeVRegDefAt(LIS.getInterval(DstReg), CopyIndex.getRegSlot()); - for (int SubRegIdx : CopySubRegs) { - if ((LiveSrcLanes & TRI.getSubRegIndexLaneMask(SubRegIdx)).none()) { - LLVM_DEBUG(dbgs() << " Skip undef subreg " - << TRI.getSubRegIndexName(SubRegIdx) << "\n"); - continue; - } - - MachineInstr *PartCopy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - TII.get(TargetOpcode::COPY)) - .addReg(DstReg, RegState::Define, SubRegIdx) - .addReg(SrcReg, 0, SubRegIdx) - .getInstr(); - LLVM_DEBUG(dbgs() << " to " << *PartCopy); - LIS.InsertMachineInstrInMaps(*PartCopy); - LIS.getInterval(PartCopy->getOperand(0).getReg()); - } - - LIS.RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); -} - -void AIESuperRegRewriter::rewriteSuperReg( - Register Reg, Register AssignedPhysReg, MachineRegisterInfo &MRI, - const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM, LiveRegMatrix &LRM, - LiveIntervals &LIS, SlotIndexes &Indexes, LiveDebugVariables &DebugVars) { - LLVM_DEBUG(dbgs() << "Rewriting " << printReg(Reg, &TRI, 0, &MRI) << '\n'); - auto *TII = static_cast( - VRM.getMachineFunction().getSubtarget().getInstrInfo()); - - // 
Collect all the subreg indices to rewrite as independent vregs. - SmallMapVector SubRegToVReg; - const TargetRegisterClass *SuperRC = MRI.getRegClass(Reg); - SmallSet SubRegs = getRewritableSubRegs(Reg, MRI, TRI); - assert(!SubRegs.empty()); - for (int SubReg : SubRegs) { - const TargetRegisterClass *SubRC = TRI.getSubRegisterClass(SuperRC, SubReg); - SubRegToVReg[SubReg] = MRI.createVirtualRegister(SubRC); - } - - // Rewrite full copies into multiple copies using subregs - for (MachineInstr &MI : make_early_inc_range(MRI.reg_instructions(Reg))) { - if (MI.isFullCopy()) - rewriteFullCopy(MI, TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()), - LIS, *TII, TRI); - } - - LLVM_DEBUG(dbgs() << " Splitting range " << LIS.getInterval(Reg) << "\n"); - for (MachineOperand &RegOp : make_early_inc_range(MRI.reg_operands(Reg))) { - LLVM_DEBUG(dbgs() << " Changing " << *RegOp.getParent()); - int SubReg = RegOp.getSubReg(); - assert(SubReg); - RegOp.setReg(SubRegToVReg[SubReg]); - RegOp.setSubReg(0); - - // There might have been a write-undef due to only writing one sub-lane. - // Now that each sub-lane has its own VReg, the qualifier is invalid. - if (RegOp.isDef()) - RegOp.setIsUndef(false); - - // Make sure the right reg class is applied, some MIs might use compound - // classes with both 20 and 32 bits registers. - const TargetRegisterClass *OpRC = TII->getRegClass( - RegOp.getParent()->getDesc(), RegOp.getParent()->getOperandNo(&RegOp), - &TRI, VRM.getMachineFunction()); - MRI.constrainRegClass(SubRegToVReg[SubReg], OpRC); - - LLVM_DEBUG(dbgs() << " to " << *RegOp.getParent()); - } - - VRM.grow(); - LIS.removeInterval(Reg); - - for (auto &[SubRegIdx, VReg] : SubRegToVReg) { - MCRegister SubPhysReg = TRI.getSubReg(AssignedPhysReg, SubRegIdx); - LiveInterval &SubRegLI = LIS.getInterval(VReg); - LLVM_DEBUG(dbgs() << " Assigning Range: " << SubRegLI << '\n'); - - // By giving an independent VReg to each lane, we might have created - // multiple separate components. Give a VReg to each separate component. - SmallVector LIComponents; - LIS.splitSeparateComponents(SubRegLI, LIComponents); - LIComponents.push_back(&SubRegLI); - VRM.grow(); - - for (LiveInterval *LI : LIComponents) { - LRM.assign(*LI, SubPhysReg); - VRM.setRequiredPhys(LI->reg(), SubPhysReg); - LLVM_DEBUG(dbgs() << " Assigned " << printReg(LI->reg()) << "\n"); - } - } - - // Announce new VRegs so DBG locations can be updated. - auto NewVRegs = SmallVector(llvm::map_range( - SubRegToVReg, [&](auto &Mapping) { return Mapping.second; })); - DebugVars.splitRegister(Reg, NewVRegs, LIS); -} - } // end anonymous namespace char AIESuperRegRewriter::ID = 0; diff --git a/llvm/lib/Target/AIE/AIESuperRegUtils.cpp b/llvm/lib/Target/AIE/AIESuperRegUtils.cpp new file mode 100644 index 000000000000..9ae12b80e329 --- /dev/null +++ b/llvm/lib/Target/AIE/AIESuperRegUtils.cpp @@ -0,0 +1,321 @@ +//===- AIESuperRegUtils.cpp -----------------------------------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates +// +//===----------------------------------------------------------------------===// +#include "AIESuperRegUtils.h" +#include "AIEBaseInstrInfo.h" +#include "AIEBaseRegisterInfo.h" +#include "llvm/CodeGen/LiveDebugVariables.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "aie-ra" + +namespace llvm::AIESuperRegUtils { + +/// Returns the subreg indices that can be used to rewrite \p Reg into smaller +/// regs. Returns {} if the rewrite isn't possible. +SmallSet getRewritableSubRegs(Register Reg, + const MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, + std::set &VisitedVRegs) { + if (Reg.isPhysical()) { + // TODO: One could use collectSubRegs() in AIEBaseInstrInfo.cpp + // But given that MOD registers are not part of the ABI, they should + // not appear as physical registers before RA. + LLVM_DEBUG(dbgs() << " Cannot rewrite physreg " << printReg(Reg, &TRI) + << "\n"); + return {}; + } + + auto &SubRegSplit = TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()); + if (SubRegSplit.size() <= 1) { + // Register does not have multiple subregs to be rewritten into. + LLVM_DEBUG(dbgs() << " Cannot rewrite " << printReg(Reg, &TRI, 0, &MRI) + << ": no sub-reg split\n"); + return {}; + } + + VisitedVRegs.insert(Reg); + SmallSet UsedSubRegs; + for (MachineOperand &RegOp : MRI.reg_operands(Reg)) { + int SubReg = RegOp.getSubReg(); + if (SubReg && SubRegSplit.count(SubReg)) { + UsedSubRegs.insert(SubReg); + } else if (RegOp.getParent()->isFullCopy()) { + // To rewrite a full copy, both operands need to be rewritable using + // their subregs. 
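+      // For illustration (hypothetical vregs): a full copy such as
+      //   %2:ed = COPY %1:ed
+      // is only rewritable if both %1 and %2 can themselves be split per
+      // sub-lane, hence the recursive checks on DstReg and SrcReg below.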
+ Register DstReg = RegOp.getParent()->getOperand(0).getReg(); + if (!VisitedVRegs.count(DstReg) && + getRewritableSubRegs(DstReg, MRI, TRI, VisitedVRegs).empty()) { + LLVM_DEBUG(dbgs() << " Cannot rewrite " + << printReg(DstReg, &TRI, 0, &MRI) << " in " + << *RegOp.getParent()); + return {}; + } + Register SrcReg = RegOp.getParent()->getOperand(1).getReg(); + if (!VisitedVRegs.count(SrcReg) && + getRewritableSubRegs(SrcReg, MRI, TRI, VisitedVRegs).empty()) { + LLVM_DEBUG(dbgs() << " Cannot rewrite " + << printReg(SrcReg, &TRI, 0, &MRI) << " in " + << *RegOp.getParent()); + return {}; + } + UsedSubRegs.insert(SubRegSplit.begin(), SubRegSplit.end()); + } else { + LLVM_DEBUG(dbgs() << " Cannot rewrite " << RegOp << " in " + << *RegOp.getParent()); + return {}; + } + } + + return UsedSubRegs; +} + +SmallSet getRewritableSubRegs(Register Reg, + const MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI) { + std::set VisitedVRegs; + return getRewritableSubRegs(Reg, MRI, TRI, VisitedVRegs); +} + +/// Rewrite a full copy into multiple copies using the subregs in \p CopySubRegs +void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, + LiveIntervals &LIS, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM) { + assert(MI.isFullCopy()); + SlotIndex CopyIndex = LIS.getInstructionIndex(MI); + LLVM_DEBUG(dbgs() << " Changing full copy at " << CopyIndex << ": " << MI); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + LaneBitmask LiveSrcLanes = getLiveLanesAt(CopyIndex, SrcReg, LIS); + + if (!VRM.hasPhys(DstReg)) { + // FIXME: This pass may cause verification failures. The fix should + // be in the MachineVerifier. This is a very uncommon case where the + // destination register was not allocated yet. + // The machine verifier does not properly handle the semantics of: + // 1. **Partial register definitions with `undef`**: When the first + // subregister is defined with `undef`, it doesn't expect subsequent + // definitions to implicitly read that lane. + // 2. **Lane-based liveness for composite registers**: The verifier expects + // a continuous live range for the entire register, but with subregister + // definitions, different lanes have different live ranges that are being + // built up incrementally. + // 3. **Implicit reads in partial definitions**: The verifier doesn't + // recognize that `%18.sub_dim_size:ed = COPY ...` implicitly reads the + // previously defined `%18.sub_dim_count` lane. + MI.getMF()->getProperties().set( + MachineFunctionProperties::Property::FailsVerification); + } + + MachineInstr *FirstMI = nullptr; + SmallSet RegistersToRepair; + for (int SubRegIdx : CopySubRegs) { + if ((LiveSrcLanes & TRI.getSubRegIndexLaneMask(SubRegIdx)).none()) { + LLVM_DEBUG(dbgs() << " Skip undef subreg " + << TRI.getSubRegIndexName(SubRegIdx) << "\n"); + continue; + } + + MachineInstr *PartCopy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII.get(TargetOpcode::COPY)) + .addReg(DstReg, RegState::Define, SubRegIdx) + .addReg(SrcReg, 0, SubRegIdx) + .getInstr(); + + // Only set the undef flag on the first partial copy. The first copy doesn't read + // other lanes, but subsequent copies do read the previously written lanes. + // Setting undef on all copies breaks live interval tracking and causes + // machine verifier errors.
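+      // Illustrative outcome (hypothetical vregs) for an "ed" full copy:
+      //   undef %1.sub_dim_count:ed = COPY %0.sub_dim_count
+      //   %1.sub_dim_size:ed = COPY %0.sub_dim_size (reads the lane above)
+      // i.e. only the first partial copy carries the undef flag.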
+ if (!FirstMI) { + PartCopy->getOperand(0).setIsUndef(); + FirstMI = PartCopy; + } + LLVM_DEBUG(dbgs() << " to " << *PartCopy); + LIS.InsertMachineInstrInMaps(*PartCopy); + // We need to repair only the Src register. For the Dst register, + // we don't need to do anything explicit, because we will replace the + // original copy with the first lane copy in LIS. We avoid the explicit repair + // of the Dst reg because LIS would create an exclusive range for each copy: + // it considers that every sub-lane copy makes the preceding + // one dead, which is not true for composite registers. + // TODO: investigate why subregister liveness is being ignored by LIS + // at this point. + RegistersToRepair.insert(PartCopy->getOperand(1).getReg()); + } + + // Replace the original copy with the first partial copy, so we automatically repair + // DstReg's LI. + LIS.ReplaceMachineInstrInMaps(MI, *FirstMI); + MI.eraseFromParent(); + // As we don't handle all registers now (selective LI filter), + // we should make sure that all LiveIntervals are correct. + // If we don't repair, MI would still compose the LIs of some registers, + // which is not correct because MI was deleted. + repairLiveIntervals(RegistersToRepair, VRM, LRM, LIS); +} + +/// Return a mask of all the lanes that are live at \p Index +LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, + const LiveIntervals &LIS) { + const LiveInterval &LI = LIS.getInterval(Reg); + if (!LI.hasSubRanges()) + return LaneBitmask::getAll(); + + LaneBitmask LiveLanes; + for (const LiveInterval::SubRange &SubLI : LI.subranges()) { + if (SubLI.liveAt(Index)) + LiveLanes |= SubLI.LaneMask; + } + return LiveLanes; +} + +void rewriteSuperReg(Register Reg, std::optional AssignedPhysReg, + SmallSet &SubRegs, MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM, LiveIntervals &LIS, + SlotIndexes &Indexes, LiveDebugVariables &DebugVars) { + LLVM_DEBUG(dbgs() << "Rewriting " << printReg(Reg, &TRI, 0, &MRI) << '\n'); + MachineFunction &MF = VRM.getMachineFunction(); + auto *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + + // Collect all the subreg indices to rewrite as independent vregs. + SmallMapVector SubRegToVReg; + const TargetRegisterClass *SuperRC = MRI.getRegClass(Reg); + assert(!SubRegs.empty()); + for (int SubReg : SubRegs) { + const TargetRegisterClass *SubRC = + AssignedPhysReg.has_value() + ? TRI.getSubRegisterClass(SuperRC, SubReg) + : TRI.getLargestLegalSuperClass( + TRI.getSubRegisterClass(SuperRC, SubReg), MF); + SubRegToVReg[SubReg] = MRI.createVirtualRegister(SubRC); + } + + // Rewrite full copies into multiple copies using subregs + for (MachineInstr &MI : make_early_inc_range(MRI.reg_instructions(Reg))) { + if (MI.isFullCopy()) + AIESuperRegUtils::rewriteFullCopy( + MI, TRI.getSubRegSplit(MRI.getRegClass(Reg)->getID()), LIS, *TII, TRI, + VRM, LRM); + } + + LLVM_DEBUG(dbgs() << " Splitting range " << LIS.getInterval(Reg) << "\n"); + for (MachineOperand &RegOp : make_early_inc_range(MRI.reg_operands(Reg))) { + LLVM_DEBUG(dbgs() << " Changing " << *RegOp.getParent()); + int SubReg = RegOp.getSubReg(); + assert(SubReg); + RegOp.setReg(SubRegToVReg[SubReg]); + RegOp.setSubReg(0); + + // There might have been a write-undef due to only writing one sub-lane. + // Now that each sub-lane has its own VReg, the qualifier is invalid. + if (RegOp.isDef()) { + RegOp.setIsUndef(false); + // Also clear the dead flag if the def's segment does not end at the + // dead slot in the live range (the def is still alive).
+ LiveInterval &LI = LIS.getInterval(Reg); + MachineInstr *DefMI = RegOp.getParent(); + SlotIndex Def = LIS.getInstructionIndex(*DefMI); + LiveRange::iterator I = LI.FindSegmentContaining(Def); + if (I->end != Def.getDeadSlot()) + RegOp.setIsDead(false); + } + + // Make sure the right reg class is applied, some MIs might use compound + // classes with both 20 and 32 bits registers. + const TargetRegisterClass *OpRC = TII->getRegClass( + RegOp.getParent()->getDesc(), RegOp.getParent()->getOperandNo(&RegOp), + &TRI, VRM.getMachineFunction()); + MRI.constrainRegClass(SubRegToVReg[SubReg], OpRC); + + LLVM_DEBUG(dbgs() << " to " << *RegOp.getParent()); + } + + VRM.grow(); + LIS.removeInterval(Reg); + + for (auto &[SubRegIdx, VReg] : SubRegToVReg) { + LiveInterval &SubRegLI = LIS.getInterval(VReg); + LLVM_DEBUG(dbgs() << " Assigning Range: " << SubRegLI << '\n'); + + // By giving an independent VReg to each lane, we might have created + // multiple separate components. Give a VReg to each separate component. + SmallVector LIComponents; + LIS.splitSeparateComponents(SubRegLI, LIComponents); + LIComponents.push_back(&SubRegLI); + VRM.grow(); + + if (AssignedPhysReg.has_value()) { + MCRegister SubPhysReg = TRI.getSubReg(*AssignedPhysReg, SubRegIdx); + for (LiveInterval *LI : LIComponents) { + LRM.assign(*LI, SubPhysReg); + VRM.setRequiredPhys(LI->reg(), SubPhysReg); + LLVM_DEBUG(dbgs() << " Assigned " << printReg(LI->reg()) << "\n"); + } + } + } + + // Announce new VRegs so DBG locations can be updated. + auto NewVRegs = SmallVector(llvm::map_range( + SubRegToVReg, [&](auto &Mapping) { return Mapping.second; })); + DebugVars.splitRegister(Reg, NewVRegs, LIS); +} + +bool isRegUsedBy2DOr3DInstruction(const MachineRegisterInfo &MRI, + const Register &R) { + + return llvm::any_of( + MRI.use_nodbg_instructions(R), [&](const MachineInstr &MI) { + auto &TII = *static_cast( + MI.getMF()->getSubtarget().getInstrInfo()); + + // We should recognize both cases, with and without splitting. A 2D/3D + // instruction will always be split or splittable. + return TII.getOpcodeWithTupleOperands(MI.getOpcode()).has_value() || + TII.getOpcodeWithAtomicOperands(MI.getOpcode()).has_value(); + }); +} + +void repairLiveIntervals(SmallSet &RegistersToRepair, + VirtRegMap &VRM, LiveRegMatrix &LRM, + LiveIntervals &LIS) { + for (Register R : RegistersToRepair) { + + if (!LIS.hasInterval(R)) + continue; + + if (VRM.hasPhys(R)) { + const MCRegister PhysReg = VRM.getPhys(R); + const LiveInterval &OldLI = LIS.getInterval(R); + LRM.unassign(OldLI); + LIS.removeInterval(R); + const LiveInterval &LI = LIS.createAndComputeVirtRegInterval(R); + LRM.assign(LI, PhysReg); + } else { + LIS.removeInterval(R); + LIS.createAndComputeVirtRegInterval(R); + } + + // After recomputing, shrink the interval to remove any invalid segments + // This is important for registers with undefined definitions. + LIS.shrinkToUses(&LIS.getInterval(R)); + } +} + +} // namespace llvm::AIESuperRegUtils diff --git a/llvm/lib/Target/AIE/AIESuperRegUtils.h b/llvm/lib/Target/AIE/AIESuperRegUtils.h new file mode 100644 index 000000000000..d2e8f2ab9b7e --- /dev/null +++ b/llvm/lib/Target/AIE/AIESuperRegUtils.h @@ -0,0 +1,82 @@ +//===-- AIESuperRegUtils.h ------------------------------------------------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates +// +//===----------------------------------------------------------------------===// +// +// This file contains helper functions to work with 2D/3D composite registers. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AIE_AIESUPERREGUTILS_H +#define LLVM_LIB_TARGET_AIE_AIESUPERREGUTILS_H + +#include "llvm/ADT/SmallSet.h" +#include + +namespace llvm { +class Register; +class MachineRegisterInfo; +struct AIEBaseRegisterInfo; +class MachineInstr; +class LiveIntervals; +class TargetInstrInfo; +class TargetRegisterInfo; +struct LaneBitmask; +class SlotIndex; +class SlotIndexes; +class VirtRegMap; +class LiveRegMatrix; +class LiveDebugVariables; +} // namespace llvm + +namespace llvm::AIESuperRegUtils { + +/// Determines if a composite register can be safely decomposed into its +/// subregisters by analyzing all uses. A register is rewritable if all uses +/// either access specific subregisters or are full copies where both operands +/// are also rewritable. Returns the set of subregister indices that can be +/// used for rewriting, or an empty set if decomposition is not possible. +/// Physical registers and registers without subregister splits cannot be +/// rewritten. +/// +/// Returns the subreg indices that can be used to rewrite \p Reg into smaller +/// regs. Returns {} if the rewrite isn't possible. +SmallSet getRewritableSubRegs(Register Reg, + const MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, + std::set &VisitedVRegs); + +SmallSet getRewritableSubRegs(Register Reg, + const MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI); + +/// Rewrite a full copy into multiple copies using the subregs in \p CopySubRegs +void rewriteFullCopy(MachineInstr &MI, const std::set &CopySubRegs, + LiveIntervals &LIS, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM); + +/// Return a mask of all the lanes that are live at \p Index +LaneBitmask getLiveLanesAt(SlotIndex Index, Register Reg, + const LiveIntervals &LIS); + +void rewriteSuperReg(Register Reg, std::optional AssignedPhysReg, + SmallSet &SubRegs, MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, VirtRegMap &VRM, + LiveRegMatrix &LRM, LiveIntervals &LIS, + SlotIndexes &Indexes, LiveDebugVariables &DebugVars); + +bool isRegUsedBy2DOr3DInstruction(const MachineRegisterInfo &MRI, + const Register &R); + +void repairLiveIntervals(SmallSet &RegistersToRepair, + VirtRegMap &VRM, LiveRegMatrix &LRM, + LiveIntervals &LIS); + +} // namespace llvm::AIESuperRegUtils + +#endif diff --git a/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp b/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp new file mode 100644 index 000000000000..dbf070c91a3e --- /dev/null +++ b/llvm/lib/Target/AIE/AIEUnallocatedSuperRegRewriter.cpp @@ -0,0 +1,235 @@ +//===-- AIEUnallocatedSuperRegRewriter.cpp - Constrain tied sub-registers -===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates +// +//===----------------------------------------------------------------------===// + +#include "AIEBaseInstrInfo.h" +#include "AIEBaseRegisterInfo.h" +#include "AIESuperRegUtils.h" + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/LiveDebugVariables.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/LiveStacks.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aie-ra-prepare" + +namespace { + +using RegRewriteInfo = std::vector>>; + +/// Split large unallocated compound registers into multiple new smaller vregs +/// that can be allocated to scalar registers. +class AIEUnallocatedSuperRegRewriter : public MachineFunctionPass { + +public: + static char ID; + AIEUnallocatedSuperRegRewriter() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; + +/// Identify unallocated virtual registers that can be split into subregisters. +/// Returns a list of candidate registers with their rewritable subregister +/// indices, excluding unused registers and those already assigned to physical +/// registers. +static RegRewriteInfo getRewriteCandidates(MachineRegisterInfo &MRI, + const AIEBaseRegisterInfo &TRI, + VirtRegMap &VRM) { + RegRewriteInfo RegistersToRewrite; + for (unsigned VRegIdx = 0, End = MRI.getNumVirtRegs(); VRegIdx != End; + ++VRegIdx) { + const Register Reg = Register::index2VirtReg(VRegIdx); + + // Ignore unused registers + if (MRI.reg_nodbg_empty(Reg) || VRM.hasPhys(Reg)) + continue; + + SmallSet RewritableSubRegs = + AIESuperRegUtils::getRewritableSubRegs(Reg, MRI, TRI); + + if (RewritableSubRegs.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Candidate " << printReg(Reg, &TRI, 0, &MRI) << ":" + << printRegClassOrBank(Reg, MRI, &TRI) << '\n'); + + RegistersToRewrite.push_back({Reg, RewritableSubRegs}); + } + + LLVM_DEBUG(dbgs() << "Found " << RegistersToRewrite.size() + << " candidate register(s) for rewriting\n"); + + return RegistersToRewrite; +} + +/// Split candidate registers into independent virtual registers for each +/// subregister. Each composite register is rewritten using its subregister +/// indices, with live intervals and debug information updated accordingly.
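+/// For illustration (hypothetical vreg numbers): a candidate %10:ed that is
+/// only accessed through %10.sub_mod and %10.sub_dim_count is replaced by one
+/// new vreg per rewritable subregister, which the following greedy allocation
+/// run can then place in scalar registers independently.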
+void rewriteCandidates(RegRewriteInfo &RegistersToRewrite, + MachineRegisterInfo &MRI, const AIEBaseRegisterInfo &TRI, + VirtRegMap &VRM, LiveRegMatrix &LRM, LiveIntervals &LIS, + SlotIndexes &Indexes, LiveDebugVariables &DebugVars) { + + LLVM_DEBUG(dbgs() << "Rewriting " << RegistersToRewrite.size() + << " candidate register(s)\n"); + + for (auto [VReg, SubRegs] : RegistersToRewrite) { + LLVM_DEBUG(dbgs() << " Rewriting " << printReg(VReg, &TRI, 0, &MRI) + << " into " << SubRegs.size() << " subregister(s)\n"); + AIESuperRegUtils::rewriteSuperReg( + VReg, /*std::optional AssignedPhysReg = */ {}, SubRegs, MRI, + TRI, VRM, LRM, LIS, Indexes, DebugVars); + } +} + +/// Unbundle COPY/KILL instruction bundles for registers being rewritten. +/// Bundled instructions are separated into individual instructions with updated +/// slot indexes, and live intervals are repaired for affected registers. +static void expandCopyBundles(RegRewriteInfo &RegistersToRewrite, + MachineRegisterInfo &MRI, SlotIndexes &Indexes, + LiveIntervals &LIS, VirtRegMap &VRM, + LiveRegMatrix &LRM) { + + SmallSet RegistersToRepair; + for (auto [VReg, SubRegs] : RegistersToRewrite) { + + for (MachineInstr &MI : MRI.def_instructions(VReg)) { + + // Finding the last instruction in a COPY/KILL bundle (which has a + // predecessor but no successor). + if (!MI.isBundledWithPred() || MI.isBundledWithSucc()) + continue; + + SmallVector MIs({&MI}); + + // Walking backwards through the bundle to collect all bundled + // instructions. + // Only do this when the complete bundle is made out of COPYs and KILLs. + MachineBasicBlock &MBB = *MI.getParent(); + for (MachineBasicBlock::reverse_instr_iterator + I = std::next(MI.getReverseIterator()), + E = MBB.instr_rend(); + I != E && I->isBundledWithSucc(); ++I) { + if (!I->isCopy() && !I->isKill()) + break; + MIs.push_back(&*I); + } + + // Unbundling them one by one from the end. + MachineInstr *FirstMI = MIs.back(); + MachineInstr *BundleStart = FirstMI; + for (MachineInstr *BundledMI : llvm::reverse(MIs)) { + // If instruction is in the middle of the bundle, move it before the + // bundle starts, otherwise, just unbundle it. When we get to the last + // instruction, the bundle will have been completely undone. 
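+        // For illustration (hypothetical operands): a bundle like
+        //   BUNDLE { %1.sub_dim_count:ed = COPY ..., %1.sub_dim_size:ed = COPY ... }
+        // is taken apart into standalone COPYs; every instruction except the
+        // leading one then gets its own slot index below.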
+ if (BundledMI != BundleStart) { + BundledMI->removeFromBundle(); + MBB.insert(BundleStart, BundledMI); + } else if (BundledMI->isBundledWithSucc()) { + BundledMI->unbundleFromSucc(); + BundleStart = &*std::next(BundledMI->getIterator()); + } + + if (BundledMI != FirstMI) { + Indexes.insertMachineInstrInMaps(*BundledMI); + RegistersToRepair.insert(BundledMI->getOperand(0).getReg()); + RegistersToRepair.insert(BundledMI->getOperand(1).getReg()); + BundledMI->getOperand(0).setIsInternalRead(false); + } + } + } + } + + AIESuperRegUtils::repairLiveIntervals(RegistersToRepair, VRM, LRM, LIS); +} + +bool AIEUnallocatedSuperRegRewriter::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(llvm::dbgs() << "*** Splitting unallocated super-registers: " + << MF.getName() << " ***\n"); + + MachineRegisterInfo &MRI = MF.getRegInfo(); + VirtRegMap &VRM = getAnalysis().getVRM(); + LiveRegMatrix &LRM = getAnalysis().getLRM(); + LiveIntervals &LIS = getAnalysis().getLIS(); + SlotIndexes &Indexes = getAnalysis().getSI(); + LiveDebugVariables &DebugVars = + getAnalysis().getLDV(); + auto &TRI = + *static_cast(MRI.getTargetRegisterInfo()); + + LLVM_DEBUG(dbgs() << "Identifying rewrite candidates...\n"); + RegRewriteInfo RegistersToRewrite = getRewriteCandidates(MRI, TRI, VRM); + + if (RegistersToRewrite.empty()) { + LLVM_DEBUG(dbgs() << "No candidates found, skipping rewrite\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Expanding copy bundles...\n"); + expandCopyBundles(RegistersToRewrite, MRI, Indexes, LIS, VRM, LRM); + + LLVM_DEBUG(dbgs() << "Performing register rewrites...\n"); + rewriteCandidates(RegistersToRewrite, MRI, TRI, VRM, LRM, LIS, Indexes, + DebugVars); + + LLVM_DEBUG(dbgs() << "Successfully rewrote " << RegistersToRewrite.size() + << " register(s)\n"); + + return !RegistersToRewrite.empty(); +} + +} // end anonymous namespace + +char AIEUnallocatedSuperRegRewriter::ID = 0; +char &llvm::AIEUnallocatedSuperRegRewriterID = + AIEUnallocatedSuperRegRewriter::ID; + +INITIALIZE_PASS(AIEUnallocatedSuperRegRewriter, + "aie-unallocated-superreg-rewrite", + "AIE unallocated super-reg rewrite", false, false) + +llvm::FunctionPass *llvm::createAIEUnallocatedSuperRegRewriter() { + return new AIEUnallocatedSuperRegRewriter(); +} diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index 1820b3473814..5e5f42d9e4ab 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -124,6 +124,7 @@ add_llvm_target(AIECodeGen AIESubRegConstrainer.cpp AIESWPSolver.cpp AIESuperRegRewriter.cpp + AIESuperRegUtils.cpp AIETargetObjectFile.cpp AIE2AsmPrinter.cpp AIE2FrameLowering.cpp @@ -140,6 +141,7 @@ add_llvm_target(AIECodeGen AIE2TargetMachine.cpp AIE2TargetTransformInfo.cpp AIETiedRegOperands.cpp + AIEUnallocatedSuperRegRewriter.cpp ReservedRegsLICM.cpp AIEOutlineMemoryGEP.cpp AIEWawRegRewriter.cpp diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp index 7a041a7e20db..77bc2db140e5 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. 
or its affiliates // //===----------------------------------------------------------------------===// // @@ -14,7 +14,9 @@ #include "AIE2PTargetMachine.h" #include "AIE2PTargetTransformInfo.h" +#include "AIESuperRegUtils.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" using namespace llvm; extern cl::opt EnableStagedRA; @@ -25,6 +27,7 @@ extern cl::opt EnableAddressChaining; extern cl::opt EnableGlobalPtrModOptimizer; extern cl::opt EnableWAWRegRewrite; extern cl::opt EnableAIEIfConversion; +extern cl::opt EnableFineGrainedStagedRA; void AIE2PTargetMachine::anchor() {} @@ -63,14 +66,27 @@ void AIE2PPassConfig::addPreRegBankSelect() { static bool onlyAllocate3DRegisters(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register &R) { - return AIE2P::eDSRegClass.hasSubClassEq(MRI.getRegClass(R)); + + const TargetRegisterClass *RegClass = MRI.getRegClass(R); + if (!AIE2P::eDSRegClass.hasSubClassEq(RegClass)) + return false; + return EnableFineGrainedStagedRA + ? AIESuperRegUtils::isRegUsedBy2DOr3DInstruction(MRI, R) + : true; } + static bool onlyAllocate3D2DRegisters(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register &R) { - return AIE2P::eDSRegClass.hasSubClassEq(MRI.getRegClass(R)) || - AIE2P::eDRegClass.hasSubClassEq(MRI.getRegClass(R)); + const TargetRegisterClass *RegClass = MRI.getRegClass(R); + if (!AIE2P::eDSRegClass.hasSubClassEq(RegClass) && + !AIE2P::eDRegClass.hasSubClassEq(RegClass)) + return false; + return EnableFineGrainedStagedRA + ? AIESuperRegUtils::isRegUsedBy2DOr3DInstruction(MRI, R) + : true; } + static bool onlyAllocateMRegisters(const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, const Register &R) { @@ -98,6 +114,8 @@ bool AIE2PPassConfig::addRegAssignAndRewriteOptimized() { addPass(createAIESuperRegRewriter()); addPass(createGreedyRegisterAllocator(onlyAllocate3D2DRegisters)); addPass(createAIESuperRegRewriter()); + if (EnableFineGrainedStagedRA) + addPass(createAIEUnallocatedSuperRegRewriter()); } addPass(createGreedyRegisterAllocator()); if (EnableWAWRegRewrite) { diff --git a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll index f24f03290521..f612a96d2362 100644 --- a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll +++ b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll @@ -247,6 +247,7 @@ ; AIE-O1-NEXT: AIE super-reg rewrite ; AIE-O1-NEXT: Greedy Register Allocator ; AIE-O1-NEXT: AIE super-reg rewrite +; AIE-O1-NEXT: AIE unallocated super-reg rewrite ; AIE-O1-NEXT: Greedy Register Allocator ; AIE-O1-NEXT: AIE waw-reg rewrite ; AIE-O1-NEXT: Greedy Register Allocator @@ -472,6 +473,7 @@ ; AIE-O23-NEXT: AIE super-reg rewrite ; AIE-O23-NEXT: Greedy Register Allocator ; AIE-O23-NEXT: AIE super-reg rewrite +; AIE-O23-NEXT: AIE unallocated super-reg rewrite ; AIE-O23-NEXT: Greedy Register Allocator ; AIE-O23-NEXT: AIE waw-reg rewrite ; AIE-O23-NEXT: Greedy Register Allocator diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir new file mode 100644 index 000000000000..945192cd39f0 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-check-undef.mir @@ -0,0 +1,164 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 + +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +# +# RUN: llc -O2 -mtriple=aie2p -start-before=greedy \ +# RUN: -stop-before=aie-unallocated-superreg-rewrite -verify-machineinstrs %s -o - | FileCheck %s + +# The goal of this test is to check if we properly insert undef flag on the def side +# of a expanded full copy. On a sub-register def operand, it refers to the part of the +# register that isn't written. A sub-register def implicitly reads the other parts of the +# register being redefined unless the flag is set, and a missing flag can +# force the related register to be inserted in liveout set of the predecessors block, +# causing dominance problems. + +--- +name: use_all_2d_regs +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: use_all_2d_regs + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[MOV_PD_imm11_pseudo:%[0-9]+]].sub_dim_stride:ed = MOV_PD_imm11_pseudo 1 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]].sub_mod:ed = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:edn = MOV_PD_imm11_pseudo -1 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:em = COPY [[MOV_PD_imm11_pseudo]].sub_dim_stride + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:edj = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]].sub_dim_size:ed = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]].sub_dim_count:ed = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:ed = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:edc = COPY [[MOV_PD_imm11_pseudo]].sub_mod + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo5:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo2:%[0-9]+]]:ep, [[COPY7:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo2]], [[COPY]], [[MOV_PD_imm11_pseudo1]], [[COPY1]], [[COPY7]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo6:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep, [[COPY10:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo3]], 
[[COPY16]], [[COPY2]], [[COPY17]], [[COPY10]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo7:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep, [[COPY8:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo4]], [[COPY18]], [[COPY3]], [[COPY19]], [[COPY8]] + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:edn = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo8:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo5:%[0-9]+]]:ep, [[COPY11:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo5]], [[COPY20]], [[COPY21]], [[COPY22]], [[COPY11]] + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo6:%[0-9]+]]:ep, [[COPY9:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo6]], [[COPY23]], [[COPY4]], [[COPY24]], [[COPY9]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo9:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo7:%[0-9]+]]:ep, [[COPY12:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo7]], [[COPY25]], [[COPY5]], [[COPY26]], [[COPY12]] + ; CHECK-NEXT: undef [[COPY27:%[0-9]+]].sub_dim_count:ed = COPY [[COPY10]] { + ; CHECK-NEXT: internal [[COPY27]].sub_dim_size:ed = COPY [[COPY2]] + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY28:%[0-9]+]]:edc = COPY [[COPY14]].sub_dim_count + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:edn = COPY [[COPY14]].sub_dim_size + ; CHECK-NEXT: [[COPY30:%[0-9]+]]:edj = COPY [[COPY14]].sub_dim_stride + ; CHECK-NEXT: [[COPY31:%[0-9]+]]:em = COPY [[COPY14]].sub_mod + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo9:%[0-9]+]]:ep, [[COPY28:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo9]], [[COPY31]], [[COPY29]], [[COPY30]], [[COPY28]] + ; CHECK-NEXT: [[COPY32:%[0-9]+]]:em = COPY [[COPY]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo10:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY33:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo8:%[0-9]+]]:ep, [[COPY13:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo8]], [[COPY32]], [[COPY6]], [[COPY33]], [[COPY13]] + ; CHECK-NEXT: [[COPY34:%[0-9]+]]:edn = COPY [[COPY31]] + ; CHECK-NEXT: undef [[COPY14:%[0-9]+]].sub_dim_count:ed = COPY [[COPY28]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_dim_size:ed = COPY [[COPY29]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_dim_stride:ed = COPY [[COPY30]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_mod:ed = COPY [[COPY31]] + ; CHECK-NEXT: [[COPY35:%[0-9]+]]:em = COPY [[COPY31]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edc = COPY [[COPY27]].sub_dim_count { + ; CHECK-NEXT: internal [[COPY2]]:edn = COPY [[COPY27]].sub_dim_size + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY36:%[0-9]+]]:edj = COPY [[COPY1]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo10:%[0-9]+]]:ep, [[COPY15:%[0-9]+]]:edc = PADD_2D_pseudo_split [[MOV_PD_imm11_pseudo10]], [[COPY35]], [[COPY34]], [[COPY36]], [[COPY15]] + ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 + bb.0: + successors: %bb.1(0x80000000) + + undef %80.sub_dim_stride:ed = MOV_PD_imm11_pseudo 1 + %80.sub_mod:ed = MOV_PD_imm11_pseudo 0 + undef %105.sub_dim_size:ed = MOV_PD_imm11_pseudo -1 + %105.sub_mod:ed = COPY %80.sub_dim_stride + 
%105.sub_dim_stride:ed = COPY %80.sub_mod + undef %101.sub_dim_size:ed = COPY %80.sub_mod + undef %97.sub_dim_size:ed = COPY %80.sub_mod + undef %90.sub_dim_size:ed = COPY %80.sub_mod + undef %86.sub_dim_size:ed = COPY %80.sub_mod + undef %82.sub_dim_size:ed = COPY %80.sub_mod + %80.sub_dim_size:ed = COPY %80.sub_mod + %105.sub_dim_count:ed = COPY %80.sub_mod + %97.sub_dim_count:ed = COPY %80.sub_mod + %90.sub_dim_count:ed = COPY %80.sub_mod + %101.sub_dim_count:ed = COPY %80.sub_mod + undef %94.sub_dim_count:ed = COPY %80.sub_mod + %86.sub_dim_count:ed = COPY %80.sub_mod + %82.sub_dim_count:ed = COPY %80.sub_mod + %80.sub_dim_count:ed = COPY %80.sub_mod + undef %77.sub_dim_count:ed = COPY %80.sub_mod + + bb.1: + successors: %bb.1(0x80000000) + + %10:ep = MOV_PD_imm11_pseudo 0 + %18:ep = MOV_PD_imm11_pseudo 0 + %22:ep = MOV_PD_imm11_pseudo 0 + %26:ep = MOV_PD_imm11_pseudo 0 + dead %10:ep, %105.sub_dim_count:ed = PADD_2D_pseudo_split %10, %105.sub_mod, %105.sub_dim_size, %105.sub_dim_stride, %105.sub_dim_count + %30:ep = MOV_PD_imm11_pseudo 0 + %101.sub_mod:ed = COPY %105.sub_mod + %101.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %18:ep, %101.sub_dim_count:ed = PADD_2D_pseudo_split %18, %101.sub_mod, %101.sub_dim_size, %101.sub_dim_stride, %101.sub_dim_count + %34:ep = MOV_PD_imm11_pseudo 0 + %97.sub_mod:ed = COPY %105.sub_mod + %97.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %22:ep, %97.sub_dim_count:ed = PADD_2D_pseudo_split %22, %97.sub_mod, %97.sub_dim_size, %97.sub_dim_stride, %97.sub_dim_count + %94.sub_mod:ed = COPY %105.sub_mod + %94.sub_dim_size:ed = COPY %105.sub_dim_size + %38:ep = MOV_PD_imm11_pseudo 0 + %94.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %26:ep, %94.sub_dim_count:ed = PADD_2D_pseudo_split %26, %94.sub_mod, %94.sub_dim_size, %94.sub_dim_stride, %94.sub_dim_count + %90.sub_mod:ed = COPY %105.sub_mod + %90.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %30:ep, %90.sub_dim_count:ed = PADD_2D_pseudo_split %30, %90.sub_mod, %90.sub_dim_size, %90.sub_dim_stride, %90.sub_dim_count + %42:ep = MOV_PD_imm11_pseudo 0 + %86.sub_mod:ed = COPY %105.sub_mod + %86.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %34:ep, %86.sub_dim_count:ed = PADD_2D_pseudo_split %34, %86.sub_mod, %86.sub_dim_size, %86.sub_dim_stride, %86.sub_dim_count + dead %42:ep, %80.sub_dim_count:ed = PADD_2D_pseudo_split %42, %80.sub_mod, %80.sub_dim_size, %80.sub_dim_stride, %80.sub_dim_count + %82.sub_mod:ed = COPY %105.sub_mod + %46:ep = MOV_PD_imm11_pseudo 0 + %82.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %38:ep, %82.sub_dim_count:ed = PADD_2D_pseudo_split %38, %82.sub_mod, %82.sub_dim_size, %82.sub_dim_stride, %82.sub_dim_count + %77.sub_dim_size:ed = COPY %80.sub_mod + %77.sub_mod:ed = COPY %80.sub_mod + %77.sub_dim_stride:ed = COPY %105.sub_dim_stride + dead %46:ep, %77.sub_dim_count:ed = PADD_2D_pseudo_split %46, %77.sub_mod, %77.sub_dim_size, %77.sub_dim_stride, %77.sub_dim_count + PseudoJ_jump_imm %bb.1 + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll new file mode 100644 index 000000000000..785e3e6c6e49 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-cycle-in-bundle.ll @@ -0,0 +1,363 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. 
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates +; RUN: llc -mtriple=aie2p -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=FINE-GRAINED +; RUN: llc -mtriple=aie2p --aie-staged-ra-fine-grained-alloc=false %s -o - | FileCheck %s --check-prefix=COARSE-GRAINED + +; Function Attrs: nounwind readnone +define void @heavy_3d_user(i32 %dimsAI.sroa.5.0.copyload.i, i32 %dimsAI.sroa.7.0.copyload.i, i32 %dimsAI.sroa.9.0.copyload.i, i32 %dimsAO.sroa.7.0.copyload.i, i32 %dimsAO.sroa.4.0.copyload.i, i32 %dimsAO.sroa.6.0.copyload.i, i32 %dimsAO.sroa.0.0.copyload.i, i32 %dimsAO.sroa.5.0.copyload.i, i32 %dimsW.sroa.4.0.copyload.i, i32 %dimsW.sroa.6.0.copyload.i, i20 %0, i1 %1, i32 %dimsAI.sroa.11.0.copyload.i) { +; FINE-GRAINED-LABEL: heavy_3d_user: +; FINE-GRAINED: // %bb.0: // %entry +; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; paddxm [sp], #192; nopv +; FINE-GRAINED-NEXT: st r13, [sp, #-180]; nopx // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r14, [sp, #-184] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r15, [sp, #-188] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r9, [sp, #-164] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r10, [sp, #-168] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: mova m0, #-196; st r11, [sp, #-172]; mov p1, sp // 4-byte Folded Spill +; FINE-GRAINED-NEXT: padda [p1], m0; st p6, [sp, #-192] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: lda dj0, [p1], #-4; st lr, [sp, #-156] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: lda dj0, [p1], #-4; st r8, [sp, #-160] // 4-byte Folded Spill +; FINE-GRAINED-NEXT: lda r8, [p1, #-4]; st r12, [sp, #-176]; movx r16, #0; mov p3, #0 // 4-byte Folded Spill +; FINE-GRAINED-NEXT: lda r12, [p1, #0]; st r0, [sp, #-144]; vbcst.32 x0, r16 // 4-byte Folded Spill +; FINE-GRAINED-NEXT: st r1, [sp, #-140]; jl p3; vmov x1, x0 // 4-byte Folded Spill +; FINE-GRAINED-NEXT: vst x0, [sp, #-128] // 64-byte Folded Spill Delay Slot 5 +; FINE-GRAINED-NEXT: vst x1, [sp, #-64]; mov p6, p0 // 64-byte Folded Spill Delay Slot 4 +; FINE-GRAINED-NEXT: mova p2, #0; st dj0, [sp, #-152]; or r13, r2, r2; mov r14, r3 // 4-byte Folded Spill Delay Slot 3 +; FINE-GRAINED-NEXT: mova p0, #0; st dj0, [sp, #-148]; or r15, r4, r4; mov r9, r5 // 4-byte Folded Spill Delay Slot 2 +; FINE-GRAINED-NEXT: mova p1, #0; or r10, r6, r6; mov r11, r7 // Delay Slot 1 +; FINE-GRAINED-NEXT: movs dn3, r10; mov dj3, r15 +; FINE-GRAINED-NEXT: mova dn1, #0; movs m3, r14; mov dj7, r9 +; FINE-GRAINED-NEXT: vlda x2, [sp, #-128]; movs dn7, r11; mov dj1, #1 // 64-byte Folded Reload +; FINE-GRAINED-NEXT: vlda x3, [sp, #-64]; movs m4, dj1; mov r3, dn1 // 64-byte Folded Reload +; FINE-GRAINED-NEXT: mova dc0, #0; movs dc2, dn1; mov r4, dn1 +; FINE-GRAINED-NEXT: lda r22, [sp, #-152]; movs dc7, dn1; mov r20, dn1 // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r21, [sp, #-148]; movs dc3, dn1; mov r19, dn1 // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r0, [sp, #-144]; movs dc4, dj1; mov r5, dn1 // 4-byte Folded Reload +; FINE-GRAINED-NEXT: mova m5, #0; movs dj4, dj1; mov r6, dj1 +; FINE-GRAINED-NEXT: mova r7, #1; movs dj0, m5; movx r18, #0; vmov lfl0, x2 +; FINE-GRAINED-NEXT: lda r1, [sp, #-140]; movs dn4, m5; and r16, r12, r7; vmov lfh0, x3 // 4-byte Folded Reload +; FINE-GRAINED-NEXT: .LBB0_1: // %for.body.i +; FINE-GRAINED-NEXT: // =>This Loop Header: Depth=1 +; FINE-GRAINED-NEXT: // Child Loop BB0_2 Depth 2 +; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; nopx ; mov dn2, r3; nopv +; FINE-GRAINED-NEXT: movs dj2, p6; nopx 
; mov dn6, r3 +; FINE-GRAINED-NEXT: movs dj6, p6; mov m2, m4 +; FINE-GRAINED-NEXT: mova p1, #0; movs dc6, r4; mov r25, r18 +; FINE-GRAINED-NEXT: vldb.pop.576.3d ex0, [p1, lf1, r25, d2] +; FINE-GRAINED-NEXT: mov m1, m5 +; FINE-GRAINED-NEXT: movs dj1, m5; mov dn1, r3 +; FINE-GRAINED-NEXT: movs dc1, dc0; vmov lfl1, lfl0 +; FINE-GRAINED-NEXT: movs dn5, r3; vmov lfh1, lfh0 +; FINE-GRAINED-NEXT: mova p0, #0; movs dj5, m5; mov dc5, r19 +; FINE-GRAINED-NEXT: paddb.3d [p0], d1 +; FINE-GRAINED-NEXT: mova p0, #0; mov r19, dc5 +; FINE-GRAINED-NEXT: .LBB0_2: // %for.body125.i +; FINE-GRAINED-NEXT: // Parent Loop BB0_1 Depth=1 +; FINE-GRAINED-NEXT: // => This Inner Loop Header: Depth=2 +; FINE-GRAINED-NEXT: nopa ; nopb ; nopx ; mov dc6, dc0 +; FINE-GRAINED-NEXT: mov dn2, r3 +; FINE-GRAINED-NEXT: movs dc2, dc0; mov dj2, r0 +; FINE-GRAINED-NEXT: movs m2, r8; mov dj6, r13 +; FINE-GRAINED-NEXT: movs dn6, r1; mov r25, r18 +; FINE-GRAINED-NEXT: movs p1, p0; vmov lfl1, x2 +; FINE-GRAINED-NEXT: .L_LEnd0: +; FINE-GRAINED-NEXT: nopa ; vldb.pop.576.3d ex4, [p1, lf1, r25, d2]; nops ; nopx ; vmov lfh1, x3; nopv +; FINE-GRAINED-NEXT: // %bb.3: // %for.cond.cleanup124.i +; FINE-GRAINED-NEXT: // in Loop: Header=BB0_1 Depth=1 +; FINE-GRAINED-NEXT: nopa ; nopb ; nops ; nopx ; mov m0, m5; nopv +; FINE-GRAINED-NEXT: movs dn0, m5; nopx ; mov m1, m3 +; FINE-GRAINED-NEXT: movs dn1, dn3; mov dj1, dj3 +; FINE-GRAINED-NEXT: mova p0, #0; movs dn5, dn7; mov dj5, dj7 +; FINE-GRAINED-NEXT: movs dc0, r5; paddb.3d [p0], d3; mov dj7, r21 +; FINE-GRAINED-NEXT: movs dj3, r22; mov dn3, m5 +; FINE-GRAINED-NEXT: movs m3, m5; mov dn7, m5 +; FINE-GRAINED-NEXT: movs dc1, dc3; xor r17, r12, r7; mov dc5, dc7 +; FINE-GRAINED-NEXT: movs dc3, r20; and r17, r17, r7; mov dc7, dc4 +; FINE-GRAINED-NEXT: mova p1, #0; movs dc4, m5; jnz r17, #.LBB0_1 +; FINE-GRAINED-NEXT: movs m3, m1; paddb.3d [p1], d3; mov dn3, dn1 // Delay Slot 5 +; FINE-GRAINED-NEXT: mova p0, #0; movs dj3, dj1; mov dn7, dn5 // Delay Slot 4 +; FINE-GRAINED-NEXT: movs dj7, dj5; paddb.3d [p0], d0; mov r20, dc3 // Delay Slot 3 +; FINE-GRAINED-NEXT: movs dc4, m5; mov dc3, dc1 // Delay Slot 2 +; FINE-GRAINED-NEXT: mova dc0, #0; movs dc7, dc5; mov r5, dc0 // Delay Slot 1 +; FINE-GRAINED-NEXT: // %bb.4: // %ret.exit +; FINE-GRAINED-NEXT: lda p6, [sp, #-192] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r15, [sp, #-188] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r14, [sp, #-184] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda lr, [sp, #-156] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r13, [sp, #-180] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r12, [sp, #-176] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r11, [sp, #-172] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r10, [sp, #-168] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r9, [sp, #-164] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: lda r8, [sp, #-160] // 4-byte Folded Reload +; FINE-GRAINED-NEXT: ret lr +; FINE-GRAINED-NEXT: nop // Delay Slot 5 +; FINE-GRAINED-NEXT: nop // Delay Slot 4 +; FINE-GRAINED-NEXT: nop // Delay Slot 3 +; FINE-GRAINED-NEXT: paddxm [sp], #-192 // Delay Slot 2 +; FINE-GRAINED-NEXT: nop // Delay Slot 1 +; +; COARSE-GRAINED-LABEL: heavy_3d_user: +; COARSE-GRAINED: // %bb.0: // %entry +; COARSE-GRAINED-NEXT: nopa ; nopb ; nops ; paddxm [sp], #384; nopv +; COARSE-GRAINED-NEXT: st r9, [sp, #-356]; nopb ; nopx // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r10, [sp, #-360] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r11, [sp, #-364] // 4-byte Folded Spill +; 
COARSE-GRAINED-NEXT: mova m0, #-388; st r12, [sp, #-368]; mov p1, sp // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: padda [p1], m0; st r13, [sp, #-372] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj0, [p1], #-4; st r14, [sp, #-376] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r15, [sp, #-380] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st p6, [sp, #-384] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj4, [p1], #-4; st lr, [sp, #-348]; movx r16, #0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r8, [sp, #-352]; vbcst.32 x0, r16 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st r0, [sp, #-248] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m0, [p1, #-4]; vst x0, [sp, #-128]; mov p6, p0 // 64-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-304] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-272] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-336]; vmov x1, x0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda r8, [p1, #0]; st dj4, [sp, #-288]; mov p3, #0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: vst x1, [sp, #-64]; jl p3 // 64-byte Folded Spill +; COARSE-GRAINED-NEXT: mova p2, #0; st dj4, [sp, #-256] // 4-byte Folded Spill Delay Slot 5 +; COARSE-GRAINED-NEXT: mova dj4, #1; st m0, [sp, #-280]; mov r9, r1 // 4-byte Folded Spill Delay Slot 4 +; COARSE-GRAINED-NEXT: mova m0, #0; st dj4, [sp, #-320]; or r10, r2, r2; mov r11, r3 // 4-byte Folded Spill Delay Slot 3 +; COARSE-GRAINED-NEXT: mova p0, #0; st m0, [sp, #-344]; or r12, r4, r4; mov r13, r5 // 4-byte Folded Spill Delay Slot 2 +; COARSE-GRAINED-NEXT: mova p1, #0; or r14, r6, r6; mov r15, r7 // Delay Slot 1 +; COARSE-GRAINED-NEXT: lda m1, [sp, #-344]; nopb ; nopxm // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj5, [sp, #-320] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m4, [sp, #-296]; mov dn4, r15 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: st dn4, [sp, #-260]; mov dj0, r12 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-272]; mov dn0, r14 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova dc3, #0; st dn0, [sp, #-276]; mov m0, r11 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m3, [sp, #-280]; movs dj4, r13; mov dc7, dc3 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m0, [sp, #-312]; st m0, [sp, #-280] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj4, [sp, #-288]; st dj4, [sp, #-256] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m5, [sp, #-328]; movs dj6, dj5; mov m2, m1 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dn0, [sp, #-308]; movs dn3, m1; mov m1, dj5 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj0, [sp, #-304]; st m4, [sp, #-296] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dn4, [sp, #-292]; st m4, [sp, #-328] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs dc0, m2; mov dc6, m2 +; COARSE-GRAINED-NEXT: st m0, [sp, #-312] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj4, [sp, #-288] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs m0, m2; mov dc4, m2 +; COARSE-GRAINED-NEXT: st dn0, [sp, #-308] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj0, [sp, #-304] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj3, [sp, #-248]; st dn4, [sp, #-292] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m2, [sp, #-248] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj6, [sp, #-224] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dn0, [sp, #-340] // 4-byte Folded Spill +; 
COARSE-GRAINED-NEXT: st dj0, [sp, #-336] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dn4, [sp, #-324] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc4, [sp, #-252] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: vlda x2, [sp, #-128]; movs dj4, dj5; mov dc4, dj5 // 64-byte Folded Reload +; COARSE-GRAINED-NEXT: vlda x3, [sp, #-64]; st dc0, [sp, #-268] // 64-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc0, [sp, #-300] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc6, [sp, #-220] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m0, [sp, #-344] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc0, [sp, #-332]; mov dn7, r9 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj4, [sp, #-320]; mov dj7, r10 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc4, [sp, #-284]; vmov lfl0, x2 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m7, [sp, #-264]; st dc4, [sp, #-316]; movx r0, #1; vmov lfh0, x3 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova r3, #0; movs dc5, m2; and r1, r8, r0; mov dc1, m2 +; COARSE-GRAINED-NEXT: .LBB0_1: // %for.body.i +; COARSE-GRAINED-NEXT: // =>This Loop Header: Depth=1 +; COARSE-GRAINED-NEXT: // Child Loop BB0_2 Depth 2 +; COARSE-GRAINED-NEXT: lda m0, [sp, #-344]; nopb ; nopx // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dc0, [sp, #-332] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj4, [sp, #-320] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: nop +; COARSE-GRAINED-NEXT: lda dn1, [sp, #-244]; movs dj1, p6; mov dn1, dn3 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: movs dn5, dn3; mov m2, m1 +; COARSE-GRAINED-NEXT: lda dn5, [sp, #-228]; movs dj5, p6; mov dc6, dc5 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: mova p1, #0; st m2, [sp, #-216]; mov r25, r3 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: vldb.pop.576.3d ex0, [p1, lf1, r25, d1]; st dc6, [sp, #-188] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs dc1, dc0; mov dj1, m0 +; COARSE-GRAINED-NEXT: movs m1, m0; mov dj5, dj4 +; COARSE-GRAINED-NEXT: st dn1, [sp, #-340]; vmov lfl1, lfl0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m5, [sp, #-232]; st dc1, [sp, #-332]; vmov lfh1, lfh0 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc5, [sp, #-220]; movs dn1, dn3; mov dc1, dc3 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: st dn5, [sp, #-324] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj5, [sp, #-320] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs dn5, dn3; mov dj5, m0 +; COARSE-GRAINED-NEXT: st m1, [sp, #-344] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj1, [sp, #-336] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m5, [sp, #-328] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc5, [sp, #-316] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m1, [sp, #-248] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj1, [sp, #-240] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m5, [sp, #-232] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dn1, [sp, #-244] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova p0, #0; st dn5, [sp, #-228] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: paddb.3d [p0], d1; st dj5, [sp, #-224] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc1, [sp, #-236] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: mova p0, #0; st dc5, [sp, #-220] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: .LBB0_2: // %for.body125.i +; COARSE-GRAINED-NEXT: // Parent Loop BB0_1 Depth=1 +; COARSE-GRAINED-NEXT: // => This Inner Loop Header: Depth=2 +; COARSE-GRAINED-NEXT: nops ; mov dn1, 
dn3 +; COARSE-GRAINED-NEXT: movs m1, m3; mov dj1, dj3 +; COARSE-GRAINED-NEXT: movs dc1, dc3; mov dn5, dn7 +; COARSE-GRAINED-NEXT: movs m5, m7; mov dc5, dc7 +; COARSE-GRAINED-NEXT: movs dj5, dj7; mov r25, r3 +; COARSE-GRAINED-NEXT: movs p1, p0; vmov lfl1, x2 +; COARSE-GRAINED-NEXT: .L_LEnd0: +; COARSE-GRAINED-NEXT: nopa ; vldb.pop.576.3d ex4, [p1, lf1, r25, d1]; nops ; nopx ; vmov lfh1, x3; nopv +; COARSE-GRAINED-NEXT: // %bb.3: // %for.cond.cleanup124.i +; COARSE-GRAINED-NEXT: // in Loop: Header=BB0_1 Depth=1 +; COARSE-GRAINED-NEXT: lda m2, [sp, #-344]; nopb ; nopx // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dn2, [sp, #-276] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: nop +; COARSE-GRAINED-NEXT: nop +; COARSE-GRAINED-NEXT: lda dj2, [sp, #-272] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m6, [sp, #-264] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dn6, [sp, #-260] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj6, [sp, #-256] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj0, [sp, #-304]; mov dn0, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m4, [sp, #-296]; movs m0, m2; mov dn4, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj4, [sp, #-288]; st dn2, [sp, #-276] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj2, [sp, #-272] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc0, [sp, #-300]; st m6, [sp, #-264] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc4, [sp, #-284]; st dn6, [sp, #-260] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc2, [sp, #-268]; st dj6, [sp, #-256] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc6, [sp, #-252]; st dj0, [sp, #-304] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m2, [sp, #-280]; st m4, [sp, #-296] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dj4, [sp, #-288] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m0, [sp, #-312] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj0, [sp, #-304]; st dn0, [sp, #-308]; mov p1, #0 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m4, [sp, #-296]; paddb.3d [p1], d0; st dn4, [sp, #-292] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dn0, [sp, #-308]; st dc0, [sp, #-300] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dn4, [sp, #-292]; st dc4, [sp, #-284]; mov p0, #0 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m2, [sp, #-344]; paddb.3d [p0], d2; st m2, [sp, #-280] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc2, [sp, #-268] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc6, [sp, #-252] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dj6, [sp, #-320]; st dj0, [sp, #-304] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st m4, [sp, #-296] // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda m6, [sp, #-328]; st dn0, [sp, #-308] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc2, [sp, #-332]; st dn4, [sp, #-292] // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: mov dn2, m2 +; COARSE-GRAINED-NEXT: lda m2, [sp, #-216]; movs dj2, m2; mov dn6, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda m0, [sp, #-312]; movs dc6, m2; mov m0, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dj4, [sp, #-288]; movs dj4, dj6; mov dc4, m2 // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda dc0, [sp, #-300]; st m0, [sp, #-344] // 4-byte 
Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: lda dc6, [sp, #-188]; st dj4, [sp, #-320]; xor r2, r8, r0; mov p0, #0 // 4-byte Folded Reload4-byte Folded Spill +; COARSE-GRAINED-NEXT: st dc4, [sp, #-284]; paddb.3d [p0], d2; and r2, r2, r0 // 4-byte Folded Spill +; COARSE-GRAINED-NEXT: movs dc0, dc2; jnz r2, #.LBB0_1 +; COARSE-GRAINED-NEXT: st dc0, [sp, #-332] // 4-byte Folded Spill Delay Slot 5 +; COARSE-GRAINED-NEXT: st m0, [sp, #-312] // 4-byte Folded Spill Delay Slot 4 +; COARSE-GRAINED-NEXT: st dj4, [sp, #-288] // 4-byte Folded Spill Delay Slot 3 +; COARSE-GRAINED-NEXT: st dc0, [sp, #-300] // 4-byte Folded Spill Delay Slot 2 +; COARSE-GRAINED-NEXT: movs m1, m2; mov dc5, dc6 // Delay Slot 1 +; COARSE-GRAINED-NEXT: // %bb.4: // %ret.exit +; COARSE-GRAINED-NEXT: lda p6, [sp, #-384] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r15, [sp, #-380] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r14, [sp, #-376] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda lr, [sp, #-348] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r13, [sp, #-372] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r12, [sp, #-368] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r11, [sp, #-364] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r10, [sp, #-360] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r9, [sp, #-356] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: lda r8, [sp, #-352] // 4-byte Folded Reload +; COARSE-GRAINED-NEXT: ret lr +; COARSE-GRAINED-NEXT: nop // Delay Slot 5 +; COARSE-GRAINED-NEXT: nop // Delay Slot 4 +; COARSE-GRAINED-NEXT: nop // Delay Slot 3 +; COARSE-GRAINED-NEXT: paddxm [sp], #-384 // Delay Slot 2 +; COARSE-GRAINED-NEXT: nop // Delay Slot 1 +entry: + tail call void null(ptr null, ptr null, ptr null) + %2 = trunc i32 %dimsAI.sroa.11.0.copyload.i to i20 + %3 = trunc i32 %dimsAI.sroa.5.0.copyload.i to i20 + %4 = trunc i32 %dimsAI.sroa.7.0.copyload.i to i20 + %5 = trunc i32 %dimsAI.sroa.9.0.copyload.i to i20 + %6 = trunc i32 %dimsAO.sroa.7.0.copyload.i to i20 + %7 = trunc i32 %dimsAO.sroa.4.0.copyload.i to i20 + %8 = trunc i32 %dimsAO.sroa.6.0.copyload.i to i20 + %9 = trunc i32 %dimsAO.sroa.0.0.copyload.i to i20 + %10 = trunc i32 %dimsAO.sroa.5.0.copyload.i to i20 + %11 = trunc i32 %dimsW.sroa.4.0.copyload.i to i20 + %12 = trunc i32 %dimsW.sroa.6.0.copyload.i to i20 + br label %for.body.i + +for.body.i: ; preds = %if.end239.i, %entry + %dimsAI.sroa.13.0458.i = phi i32 [ 0, %entry ], [ %40, %if.end239.i ] + %dimsAO.sroa.10.0457.i = phi i32 [ 0, %entry ], [ %29, %if.end239.i ] + %dimsAO.sroa.8.0456.i = phi i32 [ 0, %entry ], [ %27, %if.end239.i ] + %dimsW.sroa.10.0455.i = phi i32 [ 1, %entry ], [ 0, %if.end239.i ] + %dimsW.sroa.8.0454.i = phi i32 [ 0, %entry ], [ %34, %if.end239.i ] + %iterator_psum_cnt1.0452.i = phi i32 [ 0, %entry ], [ %22, %if.end239.i ] + %iterator_pout_cnt0.0451.i = phi i32 [ 0, %entry ], [ %45, %if.end239.i ] + %13 = trunc i32 0 to i20 + %14 = trunc i32 %iterator_psum_cnt1.0452.i to i20 + %15 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 0, i20 0, i20 0, i20 %13, i20 0, i20 %14) + %16 = extractvalue { ptr, i20, i20 } %15, 2 + %17 = trunc i32 %dimsAI.sroa.13.0458.i to i20 + %18 = tail call { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5) null, <32 x i32> zeroinitializer, i32 0, i20 1, i20 0, i20 %17, i20 %0, i20 0, i20 0, i20 %0) + %19 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %18, 5 + 
%20 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %18, 6 + br label %for.body125.i + +for.cond.cleanup124.i: ; preds = %for.body125.i + %21 = extractvalue { ptr, i20, i20 } %15, 1 + %22 = zext i20 %16 to i32 + %23 = trunc i32 %dimsAO.sroa.8.0456.i to i20 + %24 = trunc i32 %dimsAO.sroa.10.0457.i to i20 + %25 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 %6, i20 %7, i20 %8, i20 %9, i20 %23, i20 %10, i20 %24) + %26 = extractvalue { ptr, i20, i20 } %25, 1 + %27 = zext i20 %26 to i32 + %28 = extractvalue { ptr, i20, i20 } %25, 2 + %29 = zext i20 %28 to i32 + %30 = trunc i32 %dimsW.sroa.8.0454.i to i20 + %31 = trunc i32 %dimsW.sroa.10.0455.i to i20 + %32 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 %11, i20 %12, i20 0, i20 %30, i20 0, i20 %31) + %33 = extractvalue { ptr, i20, i20 } %32, 1 + %34 = zext i20 %33 to i32 + %35 = extractvalue { ptr, i20, i20 } %32, 2 + br i1 %1, label %if.else.i14, label %if.end239.i + +for.body125.i: ; preds = %for.body125.i, %for.body.i + %36 = trunc i32 0 to i20 + %37 = trunc i32 0 to i20 + %38 = tail call { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5) null, <32 x i32> zeroinitializer, i32 0, i20 %2, i20 0, i20 %36, i20 %3, i20 %4, i20 %37, i20 %5) + %39 = extractvalue { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } %38, 3 + %40 = zext i20 %39 to i32 + %41 = call i1 @llvm.loop.decrement.i32(i32 0) + br i1 %41, label %for.body125.i, label %for.cond.cleanup124.i + +if.else.i14: ; preds = %for.cond.cleanup124.i + %add.ptr.i327.i = getelementptr i8, ptr null, i20 0 + br label %if.end239.i + +if.end239.i: ; preds = %if.else.i14, %for.cond.cleanup124.i + %42 = trunc i32 %iterator_pout_cnt0.0451.i to i20 + %43 = tail call { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr null, i20 0, i20 0, i20 1, i20 0, i20 %42, i20 0, i20 0) + %44 = extractvalue { ptr, i20, i20 } %43, 1 + %45 = zext i20 %44 to i32 + %46 = extractvalue { ptr, i20, i20 } %43, 2 + br i1 %1, label %ret.exit, label %for.body.i + +ret.exit: ; preds = %if.end239.i + ret void +} + +; Function Attrs: nounwind memory(none) +declare { ptr, i20, i20 } @llvm.aie2p.add.3d(ptr, i20, i20, i20, i20, i20, i20, i20) #0 + +; Function Attrs: nounwind memory(argmem: read) +declare { ptr addrspace(5), <32 x i32>, i32, i20, i20, <64 x i8>, <8 x i8> } @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5(ptr addrspace(5), <32 x i32>, i32, i20, i20, i20, i20, i20, i20, i20) #1 + +; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn +declare i1 @llvm.loop.decrement.i32(i32) #2 + +; uselistorder directives +uselistorder ptr @llvm.aie2p.add.3d, { 3, 2, 1, 0 } +uselistorder ptr @llvm.aie2p.fifo.ld.pop.576.3d.bfp16.p5.p5, { 1, 0 } + +attributes #0 = { nounwind memory(none) } +attributes #1 = { nounwind memory(argmem: read) } +attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn } diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir index 3c040f002206..5339cba35601 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-ra-spill.mir @@ -4,44 +4,70 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. 
or its affiliates +# RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs --aie-staged-ra -start-before=greedy -aie-staged-ra-fine-grained-alloc=false \ +# RUN: -stop-after=virtregrewriter %s -o - | FileCheck %s --check-prefix=RA-STAGED # RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs --aie-staged-ra -start-before=greedy -stop-after=virtregrewriter %s -o - \ -# RUN: | FileCheck %s --check-prefix=RA +# RUN: | FileCheck %s --check-prefix=RA-STAGEG-FG -# Test what happens the 2D allocation stage needs to spill, and then the +# Test what happens when the 2D allocation stage needs to spill, and then the # last allocation stage needs to spill again to make space for allocating -# %7:edj = MOV_PD_imm10_pseudo 12. +# %7:edj = MOV_PD_imm10_pseudo 12. Please note that in RA-STAGEG-FG +# (FG = fine grained) we can avoid spills by using scalar registers. --- name: test_spill_2d_last_stage tracksRegLiveness: true body: | bb.1.entry: liveins: $p0, $p1, $d1, $d2, $d3, $d4, $d5, $d6, $d7 - ; RA-LABEL: name: test_spill_2d_last_stage - ; RA: liveins: $d1, $d2, $d3, $d4, $d5, $d6, $d7, $p0, $p1 - ; RA-NEXT: {{ $}} - ; RA-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 0 - ; RA-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 4 - ; RA-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 8 - ; RA-NEXT: ST_D_SPILL renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) - ; RA-NEXT: renamable $dj0 = MOV_PD_imm11_pseudo 12 - ; RA-NEXT: renamable $r0 = LDA_dms_lda_idx renamable $p1, killed renamable $dj0 - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) - ; RA-NEXT: renamable $dc0 = COPY killed renamable $r0 - ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) - ; RA-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 16 - ; RA-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 20 - ; RA-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 24 - ; RA-NEXT: renamable $dc0 = LDA_dms_lda_idx_imm killed renamable $p1, 28 - ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) - ; RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, $m0, $dn0, $dj0, $dc0 - ; RA-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) - ; RA-NEXT: $p0, dead $dc0 = PADDA_2D_split killed $p0, $m0, $dn0, $dj0, $dc0 - ; RA-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) - ; RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 + ; RA-STAGED-LABEL: name: test_spill_2d_last_stage + ; RA-STAGED: liveins: $d1, $d2, $d3, $d4, $d5, $d6, $d7, $p0, $p1 + ; RA-STAGED-NEXT: {{ $}} + ; RA-STAGED-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 0 + ; RA-STAGED-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 4 + ; RA-STAGED-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 8 + ; RA-STAGED-NEXT: ST_D_SPILL renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) + ; RA-STAGED-NEXT: renamable $dj0 = MOV_PD_imm11_pseudo 12 + ; RA-STAGED-NEXT: renamable $r0 = 
LDA_dms_lda_idx renamable $p1, killed renamable $dj0 + ; RA-STAGED-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) + ; RA-STAGED-NEXT: renamable $dc0 = COPY killed renamable $r0 + ; RA-STAGED-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) + ; RA-STAGED-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 16 + ; RA-STAGED-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 20 + ; RA-STAGED-NEXT: renamable $dj0 = LDA_dms_lda_idx_imm renamable $p1, 24 + ; RA-STAGED-NEXT: renamable $dc0 = LDA_dms_lda_idx_imm killed renamable $p1, 28 + ; RA-STAGED-NEXT: ST_D_SPILL killed renamable $d0, %stack.1, implicit $sp :: (store (s128) into %stack.1, align 4) + ; RA-STAGED-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) + ; RA-STAGED-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, $m0, $dn0, $dj0, $dc0 + ; RA-STAGED-NEXT: ST_D_SPILL killed renamable $d0, %stack.0, implicit $sp :: (store (s128) into %stack.0, align 4) + ; RA-STAGED-NEXT: renamable $d0 = LDA_D_SPILL %stack.1, implicit $sp :: (load (s128) from %stack.1, align 4) + ; RA-STAGED-NEXT: $p0, dead $dc0 = PADDA_2D_split killed $p0, $m0, $dn0, $dj0, $dc0 + ; RA-STAGED-NEXT: renamable $d0 = LDA_D_SPILL %stack.0, implicit $sp :: (load (s128) from %stack.0, align 4) + ; RA-STAGED-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 + ; + ; RA-STAGEG-FG-LABEL: name: test_spill_2d_last_stage + ; RA-STAGEG-FG: liveins: $d1, $d2, $d3, $d4, $d5, $d6, $d7, $p0, $p1 + ; RA-STAGEG-FG-NEXT: {{ $}} + ; RA-STAGEG-FG-NEXT: renamable $dn0 = LDA_dms_lda_idx_imm renamable $p1, 0 + ; RA-STAGEG-FG-NEXT: renamable $m0 = LDA_dms_lda_idx_imm renamable $p1, 4 + ; RA-STAGEG-FG-NEXT: renamable $r0 = LDA_dms_lda_idx_imm renamable $p1, 8 + ; RA-STAGEG-FG-NEXT: renamable $dj0 = MOV_PD_imm11_pseudo 12 + ; RA-STAGEG-FG-NEXT: renamable $r1 = LDA_dms_lda_idx renamable $p1, killed renamable $dj0 + ; RA-STAGEG-FG-NEXT: renamable $dc0 = COPY killed renamable $r1 + ; RA-STAGEG-FG-NEXT: renamable $dj0 = COPY killed renamable $r0 + ; RA-STAGEG-FG-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 16 + ; RA-STAGEG-FG-NEXT: renamable $r3 = LDA_dms_lda_idx_imm renamable $p1, 20 + ; RA-STAGEG-FG-NEXT: renamable $r2 = LDA_dms_lda_idx_imm renamable $p1, 24 + ; RA-STAGEG-FG-NEXT: renamable $r0 = LDA_dms_lda_idx_imm killed renamable $p1, 28 + ; RA-STAGEG-FG-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; RA-STAGEG-FG-NEXT: renamable $r4 = COPY killed renamable $dc0 + ; RA-STAGEG-FG-NEXT: renamable $dc0 = COPY killed renamable $r0 + ; RA-STAGEG-FG-NEXT: renamable $dn0 = COPY killed renamable $r1 + ; RA-STAGEG-FG-NEXT: renamable $dj0 = COPY killed renamable $r2 + ; RA-STAGEG-FG-NEXT: renamable $m0 = COPY killed renamable $r3 + ; RA-STAGEG-FG-NEXT: $p0, dead $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; RA-STAGEG-FG-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $r4, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 %20:ep = COPY $p0 %21:ep = COPY $p1 undef %100.sub_dim_size:ed = LDA_dms_lda_idx_imm %21, 0 @@ -58,3 +84,4 @@ body: | %20:ep, %101.sub_dim_count:ed = PADDA_2D_split %20, %101.sub_mod, %101.sub_dim_size, 
%101.sub_dim_stride, %101.sub_dim_count PseudoRET implicit $lr, implicit %20, implicit %100.sub_dim_count, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7 ... + diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir new file mode 100644 index 000000000000..c276bfb65972 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-expand-copy-bundle.mir @@ -0,0 +1,167 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs -start-before=greedy \ +# RUN: -stop-after=aie-unallocated-superreg-rewrite %s -o - | FileCheck %s + +# This example exposes some bundled copies that should be expanded. Please note +# that the bundled copies related to 3d instructions should not be expanded here +# because they already have physical registers assigned (are allocated). + +--- +name: test_expand_copy_bundle +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test_expand_copy_bundle + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:em_as_32bit = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:edjl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ednl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub_dim_size:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_dim_count:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:erf2 = MOV_RLC_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_hi_dim_then_sub_dim_size:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_mod:eds = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_dim_stride:eds = COPY [[COPY]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub_hi_dim_then_sub_dim_stride:eds = COPY [[COPY]] + ; CHECK-NEXT: undef [[VBCST_32_:%[0-9]+]].sub_512_lo:vec1024 = VBCST_32 [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ednl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:ednl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:edch = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:edcl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:edch = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edcl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edcl = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:edch = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[VBCST_32_:%[0-9]+]].sub_512_hi:vec1024 = COPY [[VBCST_32_]].sub_512_lo + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:eldfiforeg = COPY [[VBCST_32_]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:eps = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
[[COPY14:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo2:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo2:%[0-9]+]]:ep, dead [[COPY11:%[0-9]+]]:edcl, [[COPY12:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo2]], [[MOV_PD_imm11_pseudo]], [[COPY1]], [[COPY]], [[COPY11]], undef %29:em_as_32bit, [[COPY2]], [[COPY14]], [[COPY12]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:eds = COPY [[COPY3]] + ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY13]] + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:spill_edn_to_er = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:spill_edn_to_er = COPY [[COPY5]] + ; CHECK-NEXT: undef [[COPY19:%[0-9]+]].sub_lo_dim:eds = COPY [[COPY15]].sub_lo_dim { + ; CHECK-NEXT: internal [[COPY19]].sub_hi_dim_then_sub_dim_count:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: internal [[COPY19]].sub_hi_dim_then_sub_dim_size:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_size + ; CHECK-NEXT: internal [[COPY19]].sub_hi_dim_then_sub_dim_stride:eds = COPY [[COPY15]].sub_hi_dim_then_sub_dim_stride + ; CHECK-NEXT: } + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split:%[0-9]+]]:vec576, dead [[COPY16:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY16:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY16:%[0-9]+]].sub_avail:epsrfldf, [[COPY19:%[0-9]+]].sub_dim_count:eds, [[COPY19:%[0-9]+]].sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split [[COPY16]].sub_ptr, [[COPY16]].sub_fifo, [[COPY16]].sub_avail, [[COPY19]].sub_mod, [[COPY19]].sub_dim_size, [[COPY19]].sub_dim_stride, [[COPY19]].sub_dim_count, undef [[COPY19]].sub_hi_dim_then_sub_mod, [[COPY19]].sub_hi_dim_then_sub_dim_size, [[COPY19]].sub_hi_dim_then_sub_dim_stride, [[COPY19]].sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:spill_edc_to_er = COPY [[COPY19]].sub_dim_count + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:spill_edc_to_er = COPY [[COPY19]].sub_hi_dim_then_sub_dim_count + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:ednl = COPY [[COPY17]] { + ; CHECK-NEXT: internal [[COPY5]]:ednh = COPY [[COPY18]] + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:edcl = COPY [[COPY20]] + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:edch = COPY [[COPY21]] + ; CHECK-NEXT: undef [[COPY27:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[COPY27:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY13]] + ; CHECK-NEXT: [[COPY27:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split1:%[0-9]+]]:vec576, dead [[COPY27:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY27:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY27:%[0-9]+]].sub_avail:epsrfldf, dead [[COPY24:%[0-9]+]]:edcl, dead [[COPY26:%[0-9]+]]:edch = VLD_POP_576_3D_pseudo_split [[COPY27]].sub_ptr, [[COPY27]].sub_fifo, [[COPY27]].sub_avail, [[COPY22]], [[COPY4]], [[COPY23]], [[COPY24]], undef %53:em_as_32bit, [[COPY5]], [[COPY25]], [[COPY26]], implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: 
[[COPY28:%[0-9]+]]:ednh = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY29:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY30:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY31:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo3:%[0-9]+]]:ep, dead [[COPY8:%[0-9]+]]:edcl, [[COPY7:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo3]], [[COPY29]], [[COPY6]], [[COPY30]], [[COPY8]], undef %37:em_as_32bit, [[COPY28]], [[COPY31]], [[COPY7]] + ; CHECK-NEXT: [[COPY32:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY33:%[0-9]+]]:ednl = COPY [[COPY28]] + ; CHECK-NEXT: [[COPY34:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY35:%[0-9]+]]:ednh = COPY [[COPY28]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY36:%[0-9]+]]:edjh = COPY [[COPY]] + ; CHECK-NEXT: dead [[MOV_PD_imm11_pseudo4:%[0-9]+]]:ep, dead [[COPY10:%[0-9]+]]:edcl, [[COPY9:%[0-9]+]]:edch = PADD_3D_pseudo_split [[MOV_PD_imm11_pseudo4]], [[COPY32]], [[COPY33]], [[COPY34]], [[COPY10]], undef %45:em_as_32bit, [[COPY35]], [[COPY36]], [[COPY9]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edcl = MOV_PD_imm11_pseudo 1 + ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 + bb.0: + successors: %bb.1(0x80000000) + + undef %90.sub_mod:eds = MOV_PD_imm11_pseudo 0 + %90.sub_dim_stride:eds = COPY %90.sub_mod + %90.sub_dim_size:eds = COPY %90.sub_mod + %90.sub_hi_dim_then_sub_dim_size:eds = COPY %90.sub_mod + undef %83.sub_dim_size:eds = COPY %90.sub_mod + %83.sub_dim_count:eds = COPY %90.sub_mod + %22:erf2 = MOV_RLC_imm11_pseudo 0 + %83.sub_hi_dim_then_sub_dim_size:eds = COPY %90.sub_mod + %83.sub_hi_dim_then_sub_dim_count:eds = COPY %90.sub_mod + %83.sub_mod:eds = COPY %90.sub_mod + %83.sub_dim_stride:eds = COPY %90.sub_dim_stride + %83.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + undef %21.sub_512_lo:vec1024 = VBCST_32 %22 + undef %77.sub_dim_size:eds = COPY %90.sub_mod + %77.sub_hi_dim_then_sub_dim_size:eds = COPY %90.sub_mod + undef %71.sub_dim_size:eds = COPY %90.sub_mod + %71.sub_hi_dim_then_sub_dim_count:eds = COPY %90.sub_mod + %71.sub_dim_count:eds = COPY %90.sub_mod + undef %66.sub_hi_dim_then_sub_dim_count:eds = COPY %90.sub_mod + %66.sub_dim_count:eds = COPY %90.sub_mod + %90.sub_dim_count:eds = COPY %90.sub_mod + %90.sub_hi_dim_then_sub_dim_count:eds = COPY %90.sub_mod + %21.sub_512_hi:vec1024 = COPY %21.sub_512_lo + %64:eldfiforeg = COPY %21 + %20:eps = MOV_PD_imm11_pseudo 0 + + bb.1: + successors: %bb.1(0x80000000) + + %90.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + %8:ep = MOV_PD_imm11_pseudo 0 + dead %8:ep, %90.sub_dim_count:eds, %90.sub_hi_dim_then_sub_dim_count:eds = PADD_3D_pseudo_split %8, %90.sub_mod, %90.sub_dim_size, %90.sub_dim_stride, %90.sub_dim_count, undef %90.sub_hi_dim_then_sub_mod, %90.sub_hi_dim_then_sub_dim_size, %90.sub_hi_dim_then_sub_dim_stride, %90.sub_hi_dim_then_sub_dim_count + %104:eds = COPY %83 + undef %103.sub_ptr:epsrfldf = COPY %20 + %103.sub_fifo:epsrfldf = COPY %64 + %103.sub_avail:epsrfldf = COPY %22 + dead %82:vec576, dead %103.sub_ptr:epsrfldf, dead %103.sub_fifo:epsrfldf, dead %103.sub_avail:epsrfldf, %104.sub_dim_count:eds, %104.sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split %103.sub_ptr, %103.sub_fifo, %103.sub_avail, %104.sub_mod, %104.sub_dim_size, %104.sub_dim_stride, %104.sub_dim_count, undef 
%104.sub_hi_dim_then_sub_mod, %104.sub_hi_dim_then_sub_dim_size, %104.sub_hi_dim_then_sub_dim_stride, %104.sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + %77.sub_mod:eds = COPY %90.sub_mod + %77.sub_dim_stride:eds = COPY %90.sub_dim_stride + %77.sub_dim_count:eds = COPY %104.sub_dim_count + %77.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + %77.sub_hi_dim_then_sub_dim_count:eds = COPY %104.sub_hi_dim_then_sub_dim_count + undef %105.sub_ptr:epsrfldf = COPY %20 + %105.sub_fifo:epsrfldf = COPY %64 + %105.sub_avail:epsrfldf = COPY %22 + dead %76:vec576, dead %105.sub_ptr:epsrfldf, dead %105.sub_fifo:epsrfldf, dead %105.sub_avail:epsrfldf, %77.sub_dim_count:eds, %77.sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split %105.sub_ptr, %105.sub_fifo, %105.sub_avail, %77.sub_mod, %77.sub_dim_size, %77.sub_dim_stride, %77.sub_dim_count, undef %77.sub_hi_dim_then_sub_mod, %77.sub_hi_dim_then_sub_dim_size, %77.sub_hi_dim_then_sub_dim_stride, %77.sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + %33:ep = MOV_PD_imm11_pseudo 0 + %71.sub_hi_dim_then_sub_dim_size:eds = COPY %90.sub_mod + %71.sub_mod:eds = COPY %90.sub_mod + %71.sub_dim_stride:eds = COPY %90.sub_dim_stride + %71.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + dead %33:ep, %71.sub_dim_count:eds, %71.sub_hi_dim_then_sub_dim_count:eds = PADD_3D_pseudo_split %33, %71.sub_mod, %71.sub_dim_size, %71.sub_dim_stride, %71.sub_dim_count, undef %71.sub_hi_dim_then_sub_mod, %71.sub_hi_dim_then_sub_dim_size, %71.sub_hi_dim_then_sub_dim_stride, %71.sub_hi_dim_then_sub_dim_count + %66.sub_mod:eds = COPY %90.sub_mod + %66.sub_dim_size:eds = COPY %71.sub_hi_dim_then_sub_dim_size + %66.sub_dim_stride:eds = COPY %90.sub_dim_stride + %66.sub_hi_dim_then_sub_dim_size:eds = COPY %71.sub_hi_dim_then_sub_dim_size + %71.sub_dim_count:eds = MOV_PD_imm11_pseudo 1 + %39:ep = MOV_PD_imm11_pseudo 0 + %66.sub_hi_dim_then_sub_dim_stride:eds = COPY %90.sub_dim_stride + dead %39:ep, %66.sub_dim_count:eds, %66.sub_hi_dim_then_sub_dim_count:eds = PADD_3D_pseudo_split %39, %66.sub_mod, %66.sub_dim_size, %66.sub_dim_stride, %66.sub_dim_count, undef %66.sub_hi_dim_then_sub_mod, %66.sub_hi_dim_then_sub_dim_size, %66.sub_hi_dim_then_sub_dim_stride, %66.sub_hi_dim_then_sub_dim_count + %66.sub_dim_count:eds = MOV_PD_imm11_pseudo 1 + %90.sub_dim_count:eds = MOV_PD_imm11_pseudo 1 + PseudoJ_jump_imm %bb.1 + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir new file mode 100644 index 000000000000..f536c520d918 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ra/staged-rewrite-unallocated.mir @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -O2 -mtriple=aie2p -verify-machineinstrs -start-before=greedy \ +# RUN: -stop-before=virtregrewriter %s -o - | FileCheck %s + +# This test exposes some rewriting opportunities. 
Please note +# that the registers directly used by the 3d instruction should not be touched +# because they already have physical registers assigned (are allocated). + +--- +name: rewrite_unallocated +alignment: 16 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: rewrite_unallocated + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:erf2 = MOV_RLC_imm11_pseudo 0 + ; CHECK-NEXT: undef [[VBCST_32_:%[0-9]+]].sub_512_lo:vec1024 = VBCST_32 [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo:%[0-9]+]]:magusrc_and_magudst_and_spill_em_to_er = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:spill_edj_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:spill_edn_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:spill_edc_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:spill_edn_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:spill_edc_to_er = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:spill_edj_to_er = COPY [[COPY]] + ; CHECK-NEXT: [[VBCST_32_:%[0-9]+]].sub_512_hi:vec1024 = COPY [[VBCST_32_]].sub_512_lo + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:eldfiforeg = COPY [[VBCST_32_]] + ; CHECK-NEXT: [[MOV_PD_imm11_pseudo1:%[0-9]+]]:eps = MOV_PD_imm11_pseudo 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:edcl = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:ednl = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:edjl = COPY [[COPY]] + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:em_as_32bit = COPY [[MOV_PD_imm11_pseudo]] + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:edch = COPY [[COPY4]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:ednh = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:edjh = COPY [[COPY5]] + ; CHECK-NEXT: undef [[COPY14:%[0-9]+]].sub_ptr:epsrfldf = COPY [[MOV_PD_imm11_pseudo1]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_fifo:epsrfldf = COPY [[COPY6]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub_avail:epsrfldf = COPY [[MOV_RLC_imm11_pseudo]] + ; CHECK-NEXT: dead [[VLD_POP_576_3D_pseudo_split:%[0-9]+]]:vec576, dead [[COPY14:%[0-9]+]].sub_ptr:epsrfldf, dead [[COPY14:%[0-9]+]].sub_fifo:epsrfldf, dead [[COPY14:%[0-9]+]].sub_avail:epsrfldf, dead [[COPY7:%[0-9]+]]:edcl, dead [[COPY11:%[0-9]+]]:edch = VLD_POP_576_3D_pseudo_split [[COPY14]].sub_ptr, [[COPY14]].sub_fifo, [[COPY14]].sub_avail, [[COPY10]], [[COPY8]], [[COPY9]], [[COPY7]], undef %15:em_as_32bit, [[COPY12]], [[COPY13]], [[COPY11]], implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + ; CHECK-NEXT: PseudoJ_jump_imm %bb.1 + bb.0: + successors: %bb.1(0x80000000) + + %9:erf2 = MOV_RLC_imm11_pseudo 0 + undef %8.sub_512_lo:vec1024 = VBCST_32 %9 + undef %14.sub_mod:eds = MOV_PD_imm11_pseudo 0 + %14.sub_dim_stride:eds = COPY %14.sub_mod + %14.sub_dim_size:eds = COPY %14.sub_mod + %14.sub_dim_count:eds = COPY %14.sub_mod + %14.sub_hi_dim_then_sub_dim_size:eds = COPY %14.sub_mod + %14.sub_hi_dim_then_sub_dim_count:eds = COPY %14.sub_mod + %14.sub_hi_dim_then_sub_dim_stride:eds = COPY %14.sub_dim_stride + %8.sub_512_hi:vec1024 = COPY %8.sub_512_lo + %12:eldfiforeg = COPY %8 + %7:eps = MOV_PD_imm11_pseudo 0 + + bb.1: + successors: %bb.1(0x80000000) + + %23:eds = COPY %14 + undef %22.sub_ptr:epsrfldf = COPY %7 + %22.sub_fifo:epsrfldf = COPY %12 + %22.sub_avail:epsrfldf = COPY %9 + dead %13:vec576, dead %22.sub_ptr:epsrfldf, dead 
%22.sub_fifo:epsrfldf, dead %22.sub_avail:epsrfldf, dead %23.sub_dim_count:eds, dead %23.sub_hi_dim_then_sub_dim_count:eds = VLD_POP_576_3D_pseudo_split %22.sub_ptr, %22.sub_fifo, %22.sub_avail, %23.sub_mod, %23.sub_dim_size, %23.sub_dim_stride, %23.sub_dim_count, undef %23.sub_hi_dim_then_sub_mod, %23.sub_hi_dim_then_sub_dim_size, %23.sub_hi_dim_then_sub_dim_stride, %23.sub_hi_dim_then_sub_dim_count, implicit-def $srfifo_uf :: (load unknown-size from `ptr addrspace(5) null`, align 1, addrspace 5) + PseudoJ_jump_imm %bb.1 + +... diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir b/llvm/test/CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir index a309aeb94dc9..ba8bbb824b51 100644 --- a/llvm/test/CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir +++ b/llvm/test/CodeGen/AIE/aie2p/ra/tie-subregs-flow.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates # RUN: llc -O2 -mtriple=aie2p --issue-limit=1 --aie-bottomup-cycles=0 -verify-machineinstrs \ # RUN: -start-before=phi-node-elimination -stop-before=aie-finalize-mi-bundles \ # RUN: %s -o - | FileCheck %s @@ -181,25 +181,21 @@ body: | ; CHECK-LABEL: name: test_4_padd_scarce ; CHECK: liveins: $d2, $d3, $d4, $d5, $d6, $d7, $m0, $p0, $p1, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: frame-setup PADDXM_pstm_sp_imm 64, implicit-def $sp, implicit $sp - ; CHECK-NEXT: $m1 = MOV_alu_mv_mv_mv_scl killed $r4 - ; CHECK-NEXT: ST_dms_sts_spill killed $m1, -64, implicit $sp :: (store (s32) into %stack.0) ; CHECK-NEXT: $dc1 = MOV_alu_mv_mv_mv_scl $r3 ; CHECK-NEXT: $dn1 = MOV_alu_mv_mv_mv_scl $r1 ; CHECK-NEXT: $dj1 = MOV_alu_mv_mv_mv_scl $r2 ; CHECK-NEXT: $m1 = MOV_alu_mv_mv_mv_scl $r0 - ; CHECK-NEXT: $p0, dead $dc1 = PADDA_2D killed $p0, killed $d1 - ; CHECK-NEXT: $m1 = LDA_dms_lda_spill -64, implicit $sp :: (load (s32) from %stack.0) ; CHECK-NEXT: $m7 = MOV_alu_mv_mv_mv_scl killed $r0 ; CHECK-NEXT: $dn7 = MOV_alu_mv_mv_mv_scl killed $r1 ; CHECK-NEXT: $dj7 = MOV_alu_mv_mv_mv_scl killed $r2 ; CHECK-NEXT: $dc7 = MOV_alu_mv_mv_mv_scl killed $r3 + ; CHECK-NEXT: $p0, dead $dc1 = PADDA_2D killed $p0, killed $d1 ; CHECK-NEXT: $p2 = MOV_alu_mv_mv_mv_scl $p1 + ; CHECK-NEXT: $m1 = MOV_alu_mv_mv_mv_scl killed $r4 ; CHECK-NEXT: $dn1 = MOV_alu_mv_mv_mv_scl killed $r5 - ; CHECK-NEXT: dead $p2, $dc7 = PADDA_2D killed $p2, $d7 ; CHECK-NEXT: $dj1 = MOV_alu_mv_mv_mv_scl killed $r6 ; CHECK-NEXT: RET implicit $lr - ; CHECK-NEXT: frame-destroy PADDXM_pstm_sp_imm -64, implicit-def $sp, implicit $sp + ; CHECK-NEXT: dead $p2, $dc7 = PADDA_2D killed $p2, $d7 ; CHECK-NEXT: $dc1 = MOV_alu_mv_mv_mv_scl killed $r7 ; CHECK-NEXT: $p0, dead $dc7 = PADDA_2D killed $p0, killed $d7 ; CHECK-NEXT: $p1, dead $dc1 = PADDA_2D killed $p1, killed $d1 diff --git a/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir b/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir index a5bf4107fc8a..c8d210a3b55f 100644 --- a/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir +++ b/llvm/test/CodeGen/AIE/staged-ra-rewrite.mir @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2023-2025 Advanced Micro Devices, Inc. 
or its affiliates # RUN: llc -O2 -mtriple=aie2 -verify-machineinstrs -run-pass=greedy,aie-superreg-rewrite %s -o - | FileCheck %s --check-prefix=AIE2-VREGS # RUN: llc -O2 -mtriple=aie2 -verify-machineinstrs --aie-staged-ra -start-before=greedy -stop-after=virtregrewriter %s -o - \ @@ -291,15 +291,16 @@ body: | ; AIE2P-RA-NEXT: renamable $r0 = LDA_dms_lda_idx_imm renamable $p1, 4 ; AIE2P-RA-NEXT: renamable $r1 = LDA_dms_lda_idx_imm renamable $p1, 8 ; AIE2P-RA-NEXT: renamable $r2 = LDA_dms_lda_idx_imm killed renamable $p1, 12 - ; AIE2P-RA-NEXT: renamable $dn1 = COPY killed renamable $r0 - ; AIE2P-RA-NEXT: renamable $dj1 = COPY killed renamable $r1 - ; AIE2P-RA-NEXT: renamable $dc1 = COPY killed renamable $r2 + ; AIE2P-RA-NEXT: renamable $dn0 = COPY killed renamable $r0 + ; AIE2P-RA-NEXT: renamable $m0 = COPY killed renamable $m1 + ; AIE2P-RA-NEXT: renamable $dj0 = COPY killed renamable $r1 + ; AIE2P-RA-NEXT: renamable $dc0 = COPY killed renamable $r2 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: bb.1: - ; AIE2P-RA-NEXT: liveins: $dc1, $dj1, $dn1, $m1, $p0 + ; AIE2P-RA-NEXT: liveins: $dc0, $dj0, $dn0, $m0, $p0 ; AIE2P-RA-NEXT: {{ $}} - ; AIE2P-RA-NEXT: $p0, $dc1 = PADDA_2D_split killed $p0, killed $m1, killed $dn1, killed $dj1, killed $dc1 - ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc1 + ; AIE2P-RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0 bb.1.entry: liveins: $p0, $p1, $d1 %20:ep = COPY $p0 @@ -441,13 +442,16 @@ body: | ; AIE2P-RA-NEXT: successors: %bb.1(0x80000000) ; AIE2P-RA-NEXT: liveins: $dj1, $dn1, $m1, $p0 ; AIE2P-RA-NEXT: {{ $}} - ; AIE2P-RA-NEXT: renamable $dc1 = MOV_PD_imm11_pseudo 0 + ; AIE2P-RA-NEXT: renamable $dc0 = MOV_PD_imm11_pseudo 0 ; AIE2P-RA-NEXT: {{ $}} ; AIE2P-RA-NEXT: bb.1: - ; AIE2P-RA-NEXT: liveins: $dc1, $dj1, $dn1, $m1, $p0 + ; AIE2P-RA-NEXT: liveins: $dc0, $dj1, $dn1, $m1, $p0 ; AIE2P-RA-NEXT: {{ $}} - ; AIE2P-RA-NEXT: $p0, $dc1 = PADDA_2D_split killed $p0, killed $m1, killed $dn1, killed $dj1, killed $dc1 - ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc1 + ; AIE2P-RA-NEXT: renamable $dn0 = COPY killed renamable $dn1 + ; AIE2P-RA-NEXT: renamable $dj0 = COPY killed renamable $dj1 + ; AIE2P-RA-NEXT: renamable $m0 = COPY killed renamable $m1 + ; AIE2P-RA-NEXT: $p0, $dc0 = PADDA_2D_split killed $p0, killed $m0, killed $dn0, killed $dj0, killed $dc0 + ; AIE2P-RA-NEXT: PseudoRET implicit $lr, implicit killed renamable $p0, implicit killed renamable $dc0 bb.1.entry: liveins: $p0, $m1, $dn1, $dj1 %20:ep = COPY $p0