diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4ff761ec19b3c..1ba8e3e2a54d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -530,6 +530,10 @@ extern char &GCNRewritePartialRegUsesID; void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &); extern char &AMDGPUWaitSGPRHazardsLegacyID; +void initializeAMDGPUHotBlockRematerializePass(llvm::PassRegistry &); +FunctionPass *createAMDGPUHotBlockRematerializePass(); +extern char &AMDGPUHotBlockRematerializeID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp new file mode 100644 index 0000000000000..b00d286c938f8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRematerialize.cpp @@ -0,0 +1,1559 @@ +//===- AMDGPUHotBlockRematerialize.cpp - AMDGPU Hot BlockRematerialize ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU hot block Rematerialize +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMIRUtils.h" +#include "AMDGPUOccupancyAndLatencyHelper.h" +#include "GCNRegPressure.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/SlotIndexes.h" + +#define DEBUG_TYPE "amdgpu-hot-block-remat" + +using namespace llvm; + +static cl::opt + EnableAggressiveSgpr("amdgpu-remat-enable-hot-block-remat-aggressive-sgpr"); +static cl::opt TargetOccupancy("amdgpu-remat-target-occupancy"); + +namespace { + +typedef DenseSet InstSet; +typedef DenseSet BlockSet; +template using BlockMap = MapVector; + +struct RematNode { + enum class RematKind { + Candidate, // Not ready yet. + OneDefOneUse, + Clone, + }; + RematNode() + : Reg(0), DefMI(nullptr), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(0) {} + RematNode(unsigned R, MachineInstr *MI, unsigned S) + : Reg(R), DefMI(MI), InsertBlock(nullptr), InsertPointMI(nullptr), + Kind(RematKind::Candidate), Size(S) {} + Register Reg; + MachineInstr *DefMI; + MachineBasicBlock *InsertBlock; + union { + MachineInstr *InsertPointMI; + unsigned UserCount; + }; + RematKind Kind; + unsigned Size; // This is actually the Gain of the candidate. +}; + +struct BlockLiveInfo { + MachineBasicBlock *BB; + unsigned MaxSReg; + unsigned MaxVReg; + // Input live is the live reg which cross block. + const GCNRPTracker::LiveRegSet InputLive; +}; + +struct RematStatus { + unsigned TargetOcc; + unsigned TargetVLimit; + unsigned TargetSLimit; + unsigned MaxVPressure; + unsigned MaxSPressure; + unsigned InputPhysicalVPressure; + unsigned InputPhysicalSPressure; + // More occupancy can help more than latency cost to reach It. + bool MemBound; + // abs(VTargetOcc-STargetOcc) > 1. + bool NotBalance; + DenseMap MBBPressureMap; + DenseMap MBBInputLiveMap; + DenseMap MBBOutputLiveMap; + // Collect MBBs which has memory write. 
When move instructions cross MBB, skip + // mem inst if the MBB has memory write. To make things fast, just check + // mayStore and isBarrier. + DenseSet MemWriteMBBSet; +}; + +class AMDGPUHotBlockRematerialize : public MachineFunctionPass { + +public: + static char ID; + + DenseSet TotalUniformInsts; + DenseSet SafeToRemoveInsts; + DenseSet DivergentInsts; + void removeInst(const MachineInstr *MI) { + TotalUniformInsts.erase(MI); + SafeToRemoveInsts.erase(MI); + DivergentInsts.erase(MI); + } + + AMDGPUHotBlockRematerialize() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void applyCloneRemat(RematNode &Node, std::vector &HotBlocks, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, LiveIntervals *LIS, + MachineFunction &MF); + void applyRemat(MapVector &RematMap, + std::vector &HotBlocks, + MachineDominatorTree *DT, llvm::SlotIndexes *SlotIndexes, + MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, LiveIntervals *LIS, + MachineFunction &MF); + bool hotBlockRemat(MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, bool &IsNearTarget); + + StringRef getPassName() const override { return "AMDGPU rematerialize"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +MachineBasicBlock::iterator adjustInsertPointToAvoidSccSmash( + MachineInstr *InstructionToMove, MachineBasicBlock *MBB, + MachineBasicBlock::iterator CurrentInsertPoint, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, LiveIntervals *LIS) { + const bool WillSmashScc = + InstructionToMove->modifiesRegister(AMDGPU::SCC, SIRI); + if (WillSmashScc) { + CurrentInsertPoint = llvm::findOrCreateInsertionPointForSccDef( + MBB, CurrentInsertPoint, SIRI, SIII, &MRI, LIS); + } + + return CurrentInsertPoint; +} + +DenseMap reduceClonedMBBs( + Register Reg, BlockMap> &UserBlocks, + DenseSet &UserMBBSet, + std::vector &HotBlocks, MachineDominatorTree *DT) { + // Collect hot blocks which Exp is live in. + DenseSet HotBlockSet; + for (BlockLiveInfo &HotBlock : HotBlocks) { + if (HotBlock.InputLive.count(Reg)) { + HotBlockSet.insert(HotBlock.BB); + } + } + + // For userBlocks which dominate all hotBlocks, don't need to clone because + // the value not cross hotBlocks when later blocks are cloned. + // For userBlocks which dominated by all hotBlocks, they could share clones + // because once after hot block, the pressure is OK. + DenseSet AfterHotRangeMBBs; + for (MachineBasicBlock *MBB : UserMBBSet) { + // Always clone in hot block. + if (HotBlockSet.count(MBB)) + continue; + + bool IsDomAllHotBlocks = true; + bool IsDomedByAllHotBlocks = true; + for (MachineBasicBlock *HotMBB : HotBlockSet) { + if (!DT->dominates(MBB, HotMBB)) + IsDomAllHotBlocks = false; + if (!DT->dominates(HotMBB, MBB)) + IsDomedByAllHotBlocks = false; + if (!IsDomAllHotBlocks && !IsDomedByAllHotBlocks) + break; + } + if (IsDomAllHotBlocks) + UserBlocks.erase(MBB); + else if (IsDomedByAllHotBlocks) + AfterHotRangeMBBs.insert(MBB); + } + + // Split after hotRange block set by domtree. 
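+ // For example (illustrative, hypothetical block numbers): if the
+ // after-hot-range user blocks are bb.7, bb.8 and bb.9 and bb.7 dominates the
+ // other two, DomMap[bb.7] becomes {bb.8, bb.9}; bb.8 and bb.9 are then erased
+ // from UserMBBSet and later share the clone inserted for bb.7.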
+ DenseMap DomMap; + if (!AfterHotRangeMBBs.empty()) { + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + for (MachineBasicBlock *MBB2 : AfterHotRangeMBBs) { + if (MBB == MBB2) + continue; + if (DT->dominates(MBB, MBB2)) { + auto &Dom = DomMap[MBB]; + Dom.insert(MBB2); + auto &Dom2 = DomMap[MBB2]; + Dom.insert(Dom2.begin(), Dom2.end()); + } + } + } + for (MachineBasicBlock *MBB : AfterHotRangeMBBs) { + auto &Dom = DomMap[MBB]; + for (MachineBasicBlock *DomedMBB : Dom) { + // Remove domedMBB. + DomMap.erase(DomedMBB); + UserMBBSet.erase(DomedMBB); + } + } + } + + return DomMap; +} + +void updateUsers(Register Reg, unsigned NewReg, bool IsSubRegDef, + SmallVector &UserMIs) { + for (MachineInstr *UseMI : UserMIs) { + for (MachineOperand &MO : UseMI->operands()) { + if (!MO.isReg()) + continue; + if (MO.getReg() == Reg) { + MO.setReg(NewReg); + if (IsSubRegDef) + MO.setSubReg(0); + } + } + } +} + +void AMDGPUHotBlockRematerialize::applyCloneRemat( + RematNode &Node, std::vector &HotBlocks, + MachineDominatorTree *DT, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, const SIRegisterInfo *SIRI, + const SIInstrInfo *SIII, LiveIntervals *LIS, MachineFunction &MF) { + Register Reg = Node.Reg; + MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); + + const MCInstrDesc &Desc = DefMI->getDesc(); + const TargetRegisterClass *RC = + SIRI->getAllocatableClass(SIII->getOpRegClass(*DefMI, 0)); + const bool IsSubRegDef = DefMI->getOperand(0).getSubReg() != 0; + + const DebugLoc &DL = DefMI->getDebugLoc(); + const unsigned OpNum = DefMI->getNumOperands(); + + Node.Kind = RematNode::RematKind::Clone; + + // Group user in same blocks. + BlockMap> UserMap; + DenseSet UserMBBSet; + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); + UserMap[UseMI.getParent()].emplace_back(&UseMI); + UserMBBSet.insert(UseMI.getParent()); + } + + DenseMap DomMap = + reduceClonedMBBs(Reg, UserMap, UserMBBSet, HotBlocks, DT); + + for (auto UseIt : UserMap) { + MachineBasicBlock *MBB = UseIt.first; + // Skip same block uses. + if (MBB == DefMI->getParent()) + continue; + // Skip MBB which share clone from other MBBs. + if (UserMBBSet.count(MBB) == 0) + continue; + + Register NewReg = MRI.createVirtualRegister(RC); + auto NewDef = BuildMI(MF, DL, Desc).addDef(NewReg); + for (unsigned I = 1; I < OpNum; I++) + NewDef = NewDef.add(DefMI->getOperand(I)); + + MachineInstr *InsertPointMI = UseIt.second.front(); + SlotIndex LastSlot = SlotIndexes->getInstructionIndex(*InsertPointMI); + + for (MachineInstr *UseMI : UseIt.second) { + SlotIndex Slot = SlotIndexes->getInstructionIndex(*UseMI); + if (LastSlot > Slot) { + LastSlot = Slot; + InsertPointMI = UseMI; + } + } + + MachineBasicBlock::iterator InsertPoint = adjustInsertPointToAvoidSccSmash( + DefMI, InsertPointMI->getParent(), InsertPointMI, MRI, SIRI, SIII, LIS); + + for (MachineMemOperand *MO : DefMI->memoperands()) { + NewDef->addMemOperand(MF, MO); + } + + MBB->insert(InsertPoint, NewDef); + + SlotIndexes->insertMachineInstrInMaps(*NewDef); + + SmallVector &UserMIs = UseIt.second; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); + + // update users in dom MBBs. 
+ auto DomMapIt = DomMap.find(MBB); + if (DomMapIt != DomMap.end()) { + for (MachineBasicBlock *UpdateMBB : DomMapIt->second) { + SmallVector &UserMIs = UserMap[UpdateMBB]; + updateUsers(Reg, NewReg, IsSubRegDef, UserMIs); + } + } + } + if (MRI.use_empty(Reg)) { + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + } +} + +void applyOneDefOneUseRemat(RematNode &Node, MachineRegisterInfo &MRI, + SlotIndexes *SlotIndexes, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, + LiveIntervals *LIS) { + MachineInstr *DefMI = Node.DefMI; + MachineInstr *InsertPointMI = Node.InsertPointMI; + MachineBasicBlock *MBB = nullptr; + + // Find a valid insert point. + MachineBasicBlock::iterator InsertPoint; + if (InsertPointMI) { + InsertPoint = InsertPointMI->getIterator(); + MBB = InsertPointMI->getParent(); + } else { + InsertPoint = Node.InsertBlock->getFirstTerminator(); + MBB = Node.InsertBlock; + } + + InsertPoint = adjustInsertPointToAvoidSccSmash(DefMI, MBB, InsertPoint, MRI, + SIRI, SIII, LIS); + + // Move instruction to new location. + DefMI->removeFromParent(); + InsertPoint->getParent()->insert(InsertPoint, DefMI); + + // Update slot index. + SlotIndexes->removeSingleMachineInstrFromMaps(*DefMI); + SlotIndexes->insertMachineInstrInMaps(*DefMI); +} + +void AMDGPUHotBlockRematerialize::applyRemat( + MapVector &RematMap, + std::vector &HotBlocks, MachineDominatorTree *DT, + llvm::SlotIndexes *SlotIndexes, MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, const SIInstrInfo *SIII, LiveIntervals *LIS, + MachineFunction &MF) { + std::vector UpdateList; + for (auto &It : RematMap) + UpdateList.emplace_back(It.second); + + // Sort update list with slotIndex to make sure def moved before use. + // If use moved before def, It might not be the first use anymore. + std::sort(UpdateList.begin(), UpdateList.end(), + [&SlotIndexes](RematNode &I, RematNode &J) { + SlotIndex A = SlotIndexes->getInstructionIndex(*I.DefMI); + SlotIndex B = SlotIndexes->getInstructionIndex(*J.DefMI); + return A < B; + }); + + for (RematNode &Node : UpdateList) { + if (Node.Kind == RematNode::RematKind::OneDefOneUse) + applyOneDefOneUseRemat(Node, MRI, SlotIndexes, SIRI, SIII, LIS); + else if (Node.Kind == RematNode::RematKind::Clone) + applyCloneRemat(Node, HotBlocks, DT, MRI, SlotIndexes, SIRI, SIII, LIS, + MF); + } +} + +unsigned collectMBBPressure(MachineBasicBlock &MBB, LiveIntervals *LIS, + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { + // Skip processing current block if It has only debug instructions + if (MBB.getFirstNonDebugInstr() == MBB.end()) + return ST->getOccupancyWithNumVGPRs(0); + auto BBEnd = MBB.rbegin(); + GCNUpwardRPTracker RPTracker(*LIS); + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. 
+ if (!llvm::getNonDebugMBBEnd(BBEnd, MBB)) + return ST->getOccupancyWithNumVGPRs(0); + + GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[&MBB]; + RPTracker.reset(*BBEnd, &OutputLive, true); + + for (auto I = MBB.rbegin(), B = MBB.rend(); I != B;) { + MachineInstr &MI = (*I++); + RPTracker.recede(MI); + if (MI.mayStore() || (MI.isBarrier() && MI.getOpcode() != AMDGPU::S_BRANCH)) + Status.MemWriteMBBSet.insert(&MBB); + } + + GCNRegPressure RP = RPTracker.getMaxPressureAndReset(); + unsigned SPressure = RP.getMaxSGPR(); + if (SPressure > MaxSPressure) + MaxSPressure = SPressure; + if (RP.getVGPRNum(ST->hasGFX90AInsts()) > MaxVPressure) + MaxVPressure = RP.getVGPRNum(ST->hasGFX90AInsts()); + Status.MBBPressureMap[&MBB] = RP; + return RP.getOccupancy(*ST); +} + +unsigned collectFnPressure(MachineFunction &MF, LiveIntervals *LIS, + const MachineRegisterInfo &MRI, + const GCNSubtarget *ST, unsigned &MaxVPressure, + unsigned &MaxSPressure, RematStatus &Status) { + unsigned TgtOcc = ST->getOccupancyWithWorkGroupSizes(MF).second; + // If only have one block, input/ouput virtual live set are empty. + if (MF.size() > 1) { + // Build input output live reg first. + auto *SlotIndexes = LIS->getSlotIndexes(); + DenseMap MBBInputSlotMap; + DenseMap MBBOutputSlotMap; + for (MachineBasicBlock &MBB : MF) { + auto BBBegin = MBB.getFirstNonDebugInstr(); + if (BBBegin != MBB.end()) { + auto SI = SlotIndexes->getInstructionIndex(*BBBegin); + MBBInputSlotMap[&MBB] = SI; + } + + auto BBEnd = MBB.rbegin(); + + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + if (llvm::getNonDebugMBBEnd(BBEnd, MBB)) { + auto SI = SlotIndexes->getInstructionIndex(*BBEnd); + MBBOutputSlotMap[&MBB] = SI; + } + } + + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + auto Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg)) + continue; + + const auto &LI = LIS->getInterval(Reg); + + // Skip local live interval to make live input/ouput faster. 
+ if (LIS->intervalIsInOneMBB(LI)) + continue; + + for (auto InputIt : MBBInputSlotMap) { + MachineBasicBlock *MBB = InputIt.first; + auto SI = InputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + Status.MBBInputLiveMap[MBB][Reg] |= LiveMask; + } + + for (auto OutputIt : MBBOutputSlotMap) { + MachineBasicBlock *MBB = OutputIt.first; + auto SI = OutputIt.second; + + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); + if (LiveMask.any()) + Status.MBBOutputLiveMap[MBB][Reg] |= LiveMask; + } + } + } + + LLVM_DEBUG( + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + dbgs() << "output live"; for (auto &It : Status.MBBOutputLiveMap) { + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + } dbgs() << "input live"; + for (auto &It : Status.MBBInputLiveMap) { + unsigned Idx = It.first->getNumber(); + auto LiveReg = It.second; + dbgs() << "MBB" << Idx << ":"; + llvm::dumpLiveSet(LiveReg, SIRI); + }); + + for (auto It = MF.begin(); It != MF.end(); ++It) { + MachineBasicBlock &MBB = *It; + unsigned Occ = + collectMBBPressure(MBB, LIS, ST, MaxVPressure, MaxSPressure, Status); + if (TgtOcc > Occ) + TgtOcc = Occ; + } + return TgtOcc; +} + +RematStatus getRematStatus(MachineFunction &MF, MachineLoopInfo *MLI, + LiveIntervals *LIS, const MachineRegisterInfo &MRI, + const GCNSubtarget *ST) { + unsigned MaxSPressure = 0; + unsigned MaxVPressure = 0; + RematStatus Status; + unsigned TgtOcc = + collectFnPressure(MF, LIS, MRI, ST, MaxVPressure, MaxSPressure, Status); + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (TgtOcc >= MaxOcc) { + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = 0; + Status.TargetSLimit = 0; + Status.MaxVPressure = 0; + Status.MaxSPressure = 0; + Status.InputPhysicalVPressure = 0; + Status.InputPhysicalSPressure = 0; + Status.MemBound = false; + Status.NotBalance = false; + return Status; + } + + MaxSPressure += RegForVCC; + MaxVPressure = std::min(MaxVPressure, ST->getMaxNumVGPRs(MF)); + unsigned STgtOcc = ST->getOccupancyWithNumSGPRs(MaxSPressure); + unsigned VTgtOcc = ST->getOccupancyWithNumVGPRs(MaxVPressure); + + llvm::SchedScore TotalScore = llvm::collectLatency(MF, *ST, MLI); + bool MemBound = + TotalScore.isMemBound(TgtOcc, std::max(STgtOcc, VTgtOcc) - TgtOcc); + + bool NotBalance = false; + + const unsigned MaxOccupancy = ST->AMDGPUSubtarget::getMaxWavesPerEU(); + // Currently, only sgpr bound can be fixed with remat. + if (STgtOcc < VTgtOcc) { + unsigned BigOcc = std::max(STgtOcc, VTgtOcc); + // Change TgtOcc to in case sgpr and vgpr is not balance. + if (BigOcc > TgtOcc) { + TgtOcc = BigOcc; + NotBalance = true; + if (TgtOcc >= MaxOccupancy) + TgtOcc = MaxOccupancy - 1; + } + } + + // Collect input physical pressure. + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + unsigned VInputPressure = 0; + uint64_t SInputMask = 0; + for (const auto &Livein : MRI.liveins()) { + const Register Reg = Livein.first; + const TargetRegisterClass *RC = SIRI->getRegClassForReg(MRI, Reg); + assert(Reg.isPhysical() && "input must be physical reg"); + Register RegSize = RC->getLaneMask().getNumLanes(); + if (SIRI->isVGPR(MRI, Reg)) { + VInputPressure += RegSize; + } else { + unsigned RegIndex = SIRI->getHWRegIndex(Reg); + uint64_t Mask = ((1 << RegSize) - 1) << RegIndex; + SInputMask |= Mask; + } + } + // SGPR need to align to 4 for the 4dowrd/8dword descriptors which cause high + // pressure. 
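+ // For example (illustrative live-in assignment): a 64-bit pointer passed in
+ // s[4:5] only sets bits inside the aligned group s[4:7], so that whole group
+ // is charged and SInputPressure increases by 4.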
+ unsigned SInputPressure = 0; + uint64_t Mask = 0xf; + while (Mask != 0) { + if (Mask & SInputMask) + SInputPressure += 4; + Mask = Mask << 4; + } + + // If balanced, try next occupancy. + TgtOcc = NotBalance ? TgtOcc : (TgtOcc + 1); + + auto CC = MF.getFunction().getCallingConv(); + bool IsPsCs = CC == CallingConv::AMDGPU_CS || CC == CallingConv::AMDGPU_PS; + // For shader profiles other than ps/cs, set target profile max as 4. + if (!IsPsCs) { + TgtOcc = TgtOcc > 4 ? 4 : TgtOcc; + } + if (TargetOccupancy) + TgtOcc = TargetOccupancy; + + unsigned SLimit = ST->getMaxNumSGPRs(TgtOcc, true); + unsigned VLimit = ST->getMaxNumVGPRs(TgtOcc); + + Status.TargetOcc = TgtOcc; + Status.TargetVLimit = VLimit; + Status.TargetSLimit = SLimit; + Status.MaxVPressure = MaxVPressure; + Status.MaxSPressure = MaxSPressure; + Status.InputPhysicalVPressure = VInputPressure; + Status.InputPhysicalSPressure = SInputPressure; + Status.MemBound = MemBound; + Status.NotBalance = NotBalance; + return Status; +} + +// For case like +// %477:sreg_32_xm0 = S_AND_B32 %472.sub0:sreg_64_xexec, %304:sreg_32_xm0, +// implicit-def dead $scc; xb.uniform +// S_CMP_EQ_U32 %302:sreg_32_xm0, %475:sreg_32_xm0, implicit-def $scc; +// xb.uniform %2489:sreg_32_xm0 = S_CSELECT_B32 %477:sreg_32_xm0, 16, implicit +// killed $scc; xb.uniform +// Sink S_AND right before S_CSELECT will overwrite SCC. +// To avoid It, skip case when DefMI and UseMI has implicit define use. +bool isImplicitDefUse(MachineInstr *DefMI, MachineInstr *UseMI) { + if (DefMI->getDesc().NumImplicitDefs == 0) + return false; + + auto *TRI = DefMI->getMF()->getSubtarget().getRegisterInfo(); + for (MachineOperand &Def : DefMI->implicit_operands()) { + if (!Def.isReg()) + continue; + if (Def.isUse()) + continue; + Register Reg = Def.getReg(); + if (UseMI->readsRegister(Reg, TRI)) + return true; + } + return false; +} + +bool nearSgprSpill(unsigned MaxSPressure, const GCNSubtarget *ST, + MachineFunction &MF) { + unsigned MaxSGPR = ST->getAddressableNumSGPRs(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + Register ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg) + MaxSGPR -= 4; + + const unsigned AlignmentDelta = 3; + MaxSGPR -= AlignmentDelta; + + return MaxSPressure > MaxSGPR; +} + +// Skip live reg remated to other block. +void updateLiveInfo( + const MapVector &RematMap, + GCNRPTracker::LiveRegSet &LiveSet, + const GCNRPTracker::LiveRegSet &InputLive, const MachineBasicBlock *CurBB, + DenseMap &RPOTIndexMap) { + for (auto &It : RematMap) { + Register Reg = It.first; + // Skip reg not in live set. + if (!LiveSet.count(Reg)) + continue; + // Skip reg already in input set. + // Input set will be taken care in getReducedSize. + if (InputLive.count(Reg)) + continue; + + auto &Node = It.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is after InsertBB in Reverse post order, the def is + // still before LiveInfo.BB, It is still live. + unsigned LiveBBIndex = RPOTIndexMap[CurBB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex > InsertBBIndex) + continue; + } + // Already in remat map, don't need to check again, remove from + // candidate. + LiveSet.erase(Reg); + } +} + +// Returns the actual register saving that would be achieved by moving or +// cloning this instruction. It's essentially: +// +// size(defs) - size(uses) +// +// Note if it is not safe to move/clone this instruction, this function returns +// 0. 
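+// For example (illustrative, hypothetical registers): for
+//   %200:sgpr_128 = S_LOAD_DWORDX4_IMM %90:sgpr_64, 0, 0
+// with %90 uniquely defined, the gain is 128 - 64 = 64 bits, while a def that
+// reads VCC or any physical register returns 0.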
+// +int rematGainInBits(MachineInstr *DefMI, Register Reg, + const MachineRegisterInfo &MRI, const SIRegisterInfo *SIRI, + bool IsVGPR) { + int RematSize = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + + if (MO.getReg() == AMDGPU::EXEC) + continue; + + // Don't move user of VCC. + if (MO.getReg() == AMDGPU::VCC) { + RematSize = 0; + break; + } + Register Reg = MO.getReg(); + + // Don't move physical register use. + if (Reg.isPhysical()) { + RematSize = 0; + break; + } + + if (IsVGPR != SIRI->isVGPR(MRI, Reg)) { + // Not support mix of v and s when remat now. + // TODO: count possible pressure change here. + RematSize = 0; + break; + } + bool IsSingleDef = MRI.hasOneDef(Reg); + if (!IsSingleDef) { + IsSingleDef = llvm::isSub0Sub1SingleDef(Reg, MRI); + } + + if (IsSingleDef) { + // The reg might share with other candidates, check It here. + // Count share reg in getReducedSize. + if (EnableAggressiveSgpr) { + // In case of aggressive remat, treat multi use reg as shared reg and + // ignore size of shared reg. + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + } + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + if (unsigned SubIdx = MO.getSubReg()) { + if (OpRC) + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + } + int InputSize = SIRI->getRegSizeInBits(*OpRC); + // If input not live in hotspot, move It cross hotspot should have + // less reg then DefMi. + if (RematSize > InputSize) { + RematSize -= InputSize; + continue; + } + } + + RematSize = 0; + break; + } + return RematSize; +} + +MachineBasicBlock *findNonLoopDominator(MachineBasicBlock *BB, + MachineDominatorTree *DT, + MachineLoopInfo *LI) { + while (LI->getLoopDepth(BB) > 0) { + MachineDomTreeNode *N = DT->getNode(BB); + if (N == nullptr) + return nullptr; + MachineDomTreeNode *IDom = N->getIDom(); + if (IDom == nullptr) + return nullptr; + + BB = IDom->getBlock(); + } + + return BB; +} + +MachineBasicBlock *nearestCommonDominator(MachineDominatorTree *DT, + BlockSet &Blocks) { + auto I = Blocks.begin(), E = Blocks.end(); + + MachineBasicBlock *DomB = cast(*(I++)); + while (I != E) { + MachineBasicBlock *B = cast(*(I++)); + DomB = DT->findNearestCommonDominator(DomB, B); + if (DomB == nullptr) + return nullptr; + } + // For split block like: + // bb.42: + // %632.sub2:vreg_128 = V_MOV_B32_e32 %717.sub2:vreg_128, implicit $exec, + // // implicit $exec + // %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // implicitdef $scc, implicit $exec + // + // bb.68: + //; predecessors: %bb.42 + // successors: %bb.45(0x40000000), %bb.43(0x40000000); %bb.45(50.00%), + // %bb.43(50.00%) + // + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // which is from + // bb.42: + //%129:vgpr_32 = V_MOV_B32_e32 killed %548:vgpr_32, implicit $exec, implicit + //$exec %130:sreg_64 = S_AND_SAVEEXEC_B64 %533:sreg_64, implicitdef $exec, + // SI_MASK_BRANCH %bb.43, implicit $exec + // S_BRANCH %bb.45 + // The real common dom is bb.42. + // TODO: use _term version of exec update instructions so don't need this + // anymore. 
+ if (DomB && DomB->pred_size() == 1 && !DomB->empty()) { + // Upstreaming note: This used to be SI_MASK_BRANCH + if (DomB->begin()->getOpcode() == AMDGPU::S_CBRANCH_EXECZ) { + MachineBasicBlock *Pred = *DomB->pred_begin(); + if (Pred->succ_size() == 1 && + (Pred->empty() || !Pred->back().isBranch())) { + DomB = Pred; + } + } + } + + return DomB; +} + +MachineBasicBlock * +findInsertBlock(MachineInstr &DefMI, Register Reg, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + const MachineRegisterInfo &MRI, bool MemBound) { + + BlockSet BBSet; + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + BBSet.insert(UseMI.getParent()); + } + if (BBSet.empty()) + return nullptr; + + MachineBasicBlock *BB = *BBSet.begin(); + if (BBSet.size() > 1) { + MachineBasicBlock *BDom = nearestCommonDominator(DT, BBSet); + if (!BDom) + return nullptr; + BB = BDom; + } + // Try to find non loop dominator. + if (!MemBound) { + BB = findNonLoopDominator(BB, DT, MLI); + } + if (!BB) + return nullptr; + + // If BB is already a hot block, move to BB will not help. + // hotBlockRemat will fail It when process BB. + + // Must reachable from DefMI. + if (!llvm::reach_block(DefMI.getParent(), DT, PDT, MLI, BB)) + return nullptr; + + return BB; +} + +// Maybe expensive to be called all over the place +bool isUsedByPhi(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + for (auto &Def : DefMI->defs()) { + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Def.getReg())) { + if (UseMI.isPHI()) + return true; + } + } + return false; +} + +bool isSafeToMoveOrClone(MachineInstr *DefMI, MachineRegisterInfo &MRI) { + // Do not move PHI nodes + if (isUsedByPhi(DefMI, MRI)) + return false; + + unsigned OpNum = DefMI->getNumOperands(); + // Only move DefMI which all operand is unique def. + for (unsigned I = 0; I < OpNum; I++) { + MachineOperand &Op = DefMI->getOperand(I); + if (!Op.isReg()) + continue; + if (!Op.getReg().isPhysical() && !MRI.getUniqueVRegDef(Op.getReg()) && + !llvm::isSub0Sub1SingleDef(Op.getReg(), MRI)) { + return false; + } + } + return true; +} + +void addOneDefOneUseCandidate(std::vector *OutRematList, + int *OutRematCnt, const RematNode &Node, + MachineRegisterInfo &MRI, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, + MachineLoopInfo *MLI, bool IsVGPR, + bool MemBound) { + Register Reg = Node.Reg; + MachineInstr *DefMI = Node.DefMI; + + unsigned Size = Node.Size; + MachineInstr *UseMI = &*MRI.use_nodbg_instructions(Reg).begin(); + MachineBasicBlock *InsertBB = UseMI->getParent(); + + // For VGPR, always move next to the only user to avoid wqm or exec issue. + // But doing this will cause issue when DefMI is in wqm user not in + // wqm. Disable VGPR remat for now. + // TODO: make sure single user don't need wqm. + if (!IsVGPR) { + if (MachineBasicBlock *NewInsertBB = + findInsertBlock(*DefMI, Reg, DT, PDT, MLI, MRI, MemBound)) { + if (InsertBB != NewInsertBB) { + InsertBB = NewInsertBB; + // If can find a non-loop insert block, go to the insert block. + if (DefMI->getParent() != InsertBB) { + if (!InsertBB->empty()) { + auto It = InsertBB->getFirstNonPHI(); + It = skipDebugInstructionsForward(It, InsertBB->end()); + if (It == InsertBB->end()) + UseMI = nullptr; + else + UseMI = &*It; + } + } + } + } + } + + if (IsVGPR) { + // Don't count reg in same block for valu. + if (UseMI->getParent() == DefMI->getParent()) + return; + } + + // Skip case when DefMI has implicit define which used by UseMI. 
+ if (isImplicitDefUse(DefMI, UseMI)) { + return; + } + + RematNode FilteredNode = Node; + FilteredNode.InsertBlock = InsertBB; + FilteredNode.InsertPointMI = UseMI; + FilteredNode.Kind = RematNode::RematKind::OneDefOneUse; + OutRematList->emplace_back(FilteredNode); + *OutRematCnt += Size; +} + +// Build remat candidates from the registers in `CandidateRegSet`. +void buildRematCandiates(std::vector *OutCandidates, + DenseSet *PinnedRegSet, + GCNRPTracker::LiveRegSet &CandidateRegSet, + const MachineRegisterInfo &MRI, + const SIInstrInfo *SIII, const SIRegisterInfo *SIRI, + bool IsVGPR) { + + for (const auto &LiveRegIt : CandidateRegSet) { + Register Reg = LiveRegIt.first; + // Skip unsafe reg. + if (PinnedRegSet->count(Reg)) + continue; + + if (SIRI->isVGPR(MRI, Reg) != IsVGPR) + continue; + bool IsSafeCandidate = true; + MachineInstr *MI = MRI.getUniqueVRegDef(Reg); + if (MI) { + if (IsVGPR) { + // Only remat valu now. + if (!SIII->isVALU(MI->getOpcode()) && MI->getOpcode() != AMDGPU::COPY) + IsSafeCandidate = false; + if (MI->getOpcode() == AMDGPU::COPY) { + // Make sure src is unique define. + if (MI->getOperand(1).isReg() && + nullptr == MRI.getUniqueVRegDef(MI->getOperand(1).getReg())) + IsSafeCandidate = false; + } else { + // Skip convergent valu. + if (MI->isConvergent()) + IsSafeCandidate = false; + } + } + // Skip inst has more than 1 def. + if (MI->getDesc().NumDefs > 1) + IsSafeCandidate = false; + } else { + IsSafeCandidate = false; + } + + if (IsSafeCandidate) { + int Gain = rematGainInBits(MI, Reg, MRI, SIRI, IsVGPR); + if (Gain > 0) + OutCandidates->emplace_back(RematNode(Reg, MI, Gain >> 5)); + else + IsSafeCandidate = false; + } + // Save unsafe reg. + if (!IsSafeCandidate) + PinnedRegSet->insert(Reg); + } + + // Sort by gain. + std::sort(OutCandidates->begin(), OutCandidates->end(), + [](RematNode &I, RematNode &J) { return I.Size > J.Size; }); +} + +void addCloneCandidate(std::vector *OutRematList, int *OutRematCnt, + DenseSet *OutPinnedRegSet, + std::vector &&CloneList, + const MachineRegisterInfo &MRI) { + // Group user in same blocks. + std::vector UserSetList(CloneList.size()); + + for (size_t I = 0; I < CloneList.size(); I++) { + auto *Node = CloneList[I]; + Register Reg = Node->Reg; + MachineInstr *DefMI = Node->DefMI; + // Group user in same blocks. + BlockSet &UserSet = UserSetList[I]; + + for (auto UseIt = MRI.use_instr_nodbg_begin(Reg); + UseIt != MRI.use_instr_nodbg_end();) { + MachineInstr &UseMI = *(UseIt++); + UserSet.insert(UseMI.getParent()); + } + + if (UserSet.size() == 1) { + // All users are in same block with DefMI. + if (*UserSet.begin() == DefMI->getParent()) { + // Mark cannot remat for now. + // TODO: try to split if is bigger than 4 and only used once per + // channel. + OutPinnedRegSet->insert(Reg); + continue; + } + } + + int Size = Node->Size; + Size <<= 16; + // Pack userSet size to size. + Size |= UserSet.size(); + Node->UserCount = Size; + } + + std::sort(CloneList.begin(), CloneList.end(), + // Sort based on userSet size. + [](const RematNode *A, const RematNode *B) { + static constexpr int Mask = 0xffff; + return (A->UserCount & Mask) < (B->UserCount & Mask); + }); + + for (RematNode *Node : CloneList) { + Node->Kind = RematNode::RematKind::Clone; + OutRematList->emplace_back(*Node); + *OutRematCnt += Node->Size; + } +} + +// Filter `Candidates` into `OutRematList` based on whether +// safe to move, and decides on the actual type of Candidate (move vs cline). 
+// +// Updates `OutPinnedRegSet` with registers that cannot/should not be moved. +// +// Returns the accumulated size of all filtered candidates. +// +int filterRematCandiates(std::vector *OutRematList, + DenseSet *OutPinnedRegSet, + std::vector &&Candidates, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *MLI, + MachineRegisterInfo &MRI, bool IsVGPR, bool MemBound) { + int RematCnt = 0; + // Work one def one use first. + for (auto &Node : Candidates) { + Register Reg = Node.Reg; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + + MachineInstr *DefMI = Node.DefMI; + if (!isSafeToMoveOrClone(DefMI, MRI)) { + OutPinnedRegSet->insert(Reg); + continue; + } + + addOneDefOneUseCandidate(OutRematList, &RematCnt, Node, MRI, DT, PDT, MLI, + IsVGPR, MemBound); + } + + if (!IsVGPR) { + std::vector CloneList; + // Try multi use case. + for (auto &Node : Candidates) { + Register Reg = Node.Reg; + if (MRI.hasOneNonDBGUse(Reg)) + continue; + + MachineInstr *DefMI = Node.DefMI; + if (!isSafeToMoveOrClone(DefMI, MRI)) { + OutPinnedRegSet->insert(Reg); + continue; + } + + // Clone for each user. + CloneList.emplace_back(&Node); + } + + addCloneCandidate(OutRematList, &RematCnt, OutPinnedRegSet, + std::move(CloneList), MRI); + } + + return RematCnt; +} + +// Calculate the reduced register pressure of RematMap w.r.t. the BB associated +// with LiveInfo. +// Returns the number of registers reduced, and the instructions associated with +// the reduction nodes into `OutReducedInsts`. +int getReducedSize(const MapVector &RematMap, + GCNRPTracker::LiveRegSet &CanidateSet, + const MachineRegisterInfo &MRI, + const BlockLiveInfo &LiveInfo, + DenseMap &RPOTIndexMap, + InstSet *OutReducedInsts) { + int ReducedSize = 0; + for (const auto &It : RematMap) { + Register Reg = It.first; + + if (!CanidateSet.count(Reg)) + continue; + + bool IsReduced = false; + auto &Node = It.second; + if (Node.Kind == RematNode::RematKind::OneDefOneUse) { + MachineBasicBlock *InsertBB = Node.InsertBlock; + // If LiveInfo.BB is before InsertBB in Reverse post order, the def is + // moved after LiveInfo.BB, It is not live anymore. + unsigned LiveBBIndex = RPOTIndexMap[LiveInfo.BB]; + unsigned InsertBBIndex = RPOTIndexMap[InsertBB]; + if (LiveBBIndex < InsertBBIndex) + IsReduced = true; + } else { + // Clone. + IsReduced = true; + // If has use in LiveInfo.BB, could not reduce from input live. + for (MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) { + if (UseMI.getParent() == LiveInfo.BB) { + IsReduced = false; + break; + } + } + } + if (IsReduced) { + ReducedSize += Node.Size; + OutReducedInsts->insert(Node.DefMI); + } + + // Already in remat map, don't need to check again, remove from candidate. + CanidateSet.erase(Reg); + } + + return ReducedSize; +} + +static unsigned getNumLanesIn32BitReg(bool IsVgpr) { + const TargetRegisterClass *RC = + IsVgpr ? &AMDGPU::VGPR_32RegClass : &AMDGPU::SGPR_32RegClass; + return RC->LaneMask.getNumLanes(); +} + +// Calculate the amount of OVERLAPPING register pressure among all +// the instructions in `ReducedInsts`. E.g for: +// x = COPY a:sgpr_32 +// y = COPY a:sgpr_32 +// This function would return 1. 
+int getSharedReducedSize(const InstSet &ReducedInsts, bool IsVGPR, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI) { + int SharedSize = 0; + DenseMap SharedRegMaskMap; + for (MachineInstr *DefMI : ReducedInsts) { + for (MachineOperand &MO : DefMI->operands()) { + if (MO.isImm()) + continue; + if (!MO.isReg()) + continue; + if (MO.isDef()) + continue; + if (MO.isTied()) + continue; + Register Reg = MO.getReg(); + + if (Reg == AMDGPU::EXEC) + continue; + if (!Reg.isVirtual()) + continue; + + if (IsVGPR != SIRI->isVGPR(MRI, MO.getReg())) + // Not support mix of v and s when remat now. + continue; + + const TargetRegisterClass *OpRC = MRI.getRegClass(Reg); + const int MOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + + unsigned Mask = 0; + // FIXME: Lane mask is now in the granularity of 16-bit lanes. + if (unsigned SubIdx = MO.getSubReg()) { + OpRC = SIRI->getSubRegisterClass(OpRC, SubIdx); + int SubMOSize = SIRI->getRegSizeInBits(*OpRC) >> 5; + Mask = (1 << SubMOSize) - 1; + } else { + Mask = (1 << MOSize) - 1; + } + auto SharedRegIt = SharedRegMaskMap.find(Reg); + if (SharedRegIt == SharedRegMaskMap.end()) { + SharedRegMaskMap[Reg] = LaneBitmask(Mask); + } else { + unsigned PrevMask = SharedRegIt->second.getAsInteger(); + if (unsigned SharedMask = (PrevMask & Mask)) { + // Some thing is shared. + for (int I = 0; I < MOSize; I++) { + if (SharedMask & (1 << I)) { + SharedSize += 1; + } + } + } + LaneBitmask MoMask = LaneBitmask(Mask | PrevMask); + SharedRegMaskMap[Reg] = MoMask; + } + } + } + + const unsigned NumLanesPerReg = getNumLanesIn32BitReg(IsVGPR); + return SharedSize / NumLanesPerReg; +} + +void dumpRematMap(MapVector &RematMap, + const SIRegisterInfo *SIRI) { + dbgs() << "\n rematMap: \n"; + for (auto It : RematMap) { + int Reg = It.first; + dbgs() << printReg(Reg, SIRI); + dbgs() << "\n"; + } +} +int DebugBlockIndex = 42; +void dumpHotBlock(const GCNRPTracker::LiveRegSet &LiveSet, + MapVector &VRematMap, + MapVector &SRematMap, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + llvm::dumpLiveSet(LiveSet, SIRI); + dumpRematMap(VRematMap, SIRI); + dumpRematMap(SRematMap, SIRI); +} + +void dumpCandidates(std::vector &RematCandidates, int BlockIndex, + const SIRegisterInfo *SIRI) { + if (DebugBlockIndex != BlockIndex) + return; + dbgs() << "\n Candidates: \n"; + unsigned TotalSize = 0; + for (RematNode &Node : RematCandidates) { + dbgs() << printReg(Node.Reg, SIRI) << " size:" << Node.Size; + dbgs() << "\n"; + TotalSize += Node.Size; + } + dbgs() << "Total Size:" << TotalSize << "\n"; +} + +// A heuristic number for keeping the target SGPR number away from the limit. 
+constexpr unsigned SgprLimitBias = 10; + +bool AMDGPUHotBlockRematerialize::hotBlockRemat(MachineFunction &MF, + MachineLoopInfo *MLI, + LiveIntervals *LIS, + MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, + bool &IsNearTarget) { + const GCNSubtarget *ST = &MF.getSubtarget(); + + const SIInstrInfo *SIII = ST->getInstrInfo(); + const SIRegisterInfo *SIRI = ST->getRegisterInfo(); + + ReversePostOrderTraversal RPOT(&MF); + DenseMap RPOTIndexMap; + for (const MachineBasicBlock *MBB : RPOT) + RPOTIndexMap[MBB] = RPOTIndexMap.size(); + + auto &MRI = MF.getRegInfo(); + + bool IsUpdated = false; + RematStatus Status = getRematStatus(MF, MLI, LIS, MRI, ST); + + const unsigned MaxOcc = ST->getWavesPerEU(MF.getFunction()).second; + if (Status.TargetOcc >= MaxOcc) + return false; + + // Early checks + { + int InitialRematSCnt = Status.MaxSPressure - Status.TargetSLimit; + // when agressive sgpr remat, reserve some for allocation lost. + if (EnableAggressiveSgpr) + InitialRematSCnt += SgprLimitBias; + + bool InitialIsSGPRSpill = false; + if (InitialRematSCnt > 0) + InitialIsSGPRSpill = nearSgprSpill(Status.MaxSPressure, ST, MF); + + const bool InitialIsForceRematSgpr = + InitialIsSGPRSpill || Status.NotBalance; + + // If bound by lds, skip. + if (Status.TargetOcc > ST->getOccupancyWithWorkGroupSizes(MF).second && + !InitialIsForceRematSgpr) + return false; + } + + MachineBasicBlock *EntryMBB = &MF.front(); + + auto *SlotIndexes = LIS->getSlotIndexes(); + + // Reg which already marked remat. + MapVector VRematMap; + MapVector SRematMap; + // Reg which cannot move around to remat. + DenseSet PinnedRegSet; + std::vector HotBlocks; + for (auto It = po_begin(EntryMBB); It != po_end(EntryMBB); It++) { + MachineBasicBlock *MBB = *It; + auto &RP = Status.MBBPressureMap[MBB]; + // ignore block not hot. + + if (RP.getVGPRNum(ST->hasGFX90AInsts()) < Status.TargetVLimit && + (RP.getMaxSGPR() + RegForVCC + Status.InputPhysicalSPressure) < + Status.TargetSLimit) + continue; + // Collect reg pressure. + unsigned MaxVPressure = 0; + unsigned MaxSPressure = 0; + const GCNRPTracker::LiveRegSet InputLive = Status.MBBInputLiveMap[MBB]; + + const GCNRPTracker::LiveRegSet OutputLive = Status.MBBOutputLiveMap[MBB]; + LLVM_DEBUG( + dumpHotBlock(InputLive, VRematMap, SRematMap, MBB->getNumber(), SIRI)); + + GCNDownwardRPTracker Tracker(*LIS); + + Tracker.reset(*MBB->begin(), &InputLive); + + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + Tracker.advance(); + auto LISLR = Tracker.getLiveRegs(); + // Update live set for things already remated. + updateLiveInfo(VRematMap, LISLR, InputLive, MBB, RPOTIndexMap); + updateLiveInfo(SRematMap, LISLR, InputLive, MBB, RPOTIndexMap); + + const GCNRPTracker::LiveRegSet &LiveSet = LISLR; + unsigned VPressure = 0; + unsigned SPressure = 0; + collectLiveSetPressure(LiveSet, MRI, SIRI, VPressure, SPressure); + if (MaxVPressure < VPressure) + MaxVPressure = VPressure; + if (MaxSPressure < SPressure) + MaxSPressure = SPressure; + } + MaxSPressure += RegForVCC + Status.InputPhysicalSPressure; + if (MaxVPressure <= Status.TargetVLimit && + MaxSPressure <= Status.TargetSLimit) + continue; + + // Build block live info. + // Use outputLive for EntryMBB. + BlockLiveInfo LiveInfo = {MBB, MaxSPressure, MaxVPressure, + MBB != EntryMBB ? InputLive : OutputLive}; + // Skip entry block when save hotBlock to reduce clone because not clone in + // entry block. 
+ if (MBB != EntryMBB) + HotBlocks.emplace_back(LiveInfo); + GCNRPTracker::LiveRegSet CandidateRegs = LiveInfo.InputLive; + + // Update reg pressure based on remat list. + InstSet VReducedInsts; + InstSet SReducedInsts; + int VReduced = getReducedSize(VRematMap, CandidateRegs, MRI, LiveInfo, + RPOTIndexMap, &VReducedInsts); + int SReduced = getReducedSize(SRematMap, CandidateRegs, MRI, LiveInfo, + RPOTIndexMap, &SReducedInsts); + + // Calculate size need to be remat for this BB. + const int RematVCnt = MaxVPressure - VReduced - Status.TargetVLimit; + const int RematSCnt = MaxSPressure - SReduced - Status.TargetSLimit; + + bool IsSGPRSpill = false; + if (RematSCnt > 0) + IsSGPRSpill = nearSgprSpill(MaxSPressure, ST, MF); + + bool IsForceRematSgpr = IsSGPRSpill || Status.NotBalance; + // Try to add candidates into remat list. + + int NewRematSCnt = 0; + if (RematSCnt > 0) { + // Build candidate nodes. + std::vector SRematCandidates; + buildRematCandiates(&SRematCandidates, &PinnedRegSet, CandidateRegs, MRI, + SIII, SIRI, /*IsVGPR*/ false); + + LLVM_DEBUG(dumpCandidates(SRematCandidates, MBB->getNumber(), SIRI)); + std::vector SRematList; + // Filter candidates. + NewRematSCnt = + filterRematCandiates(&SRematList, &PinnedRegSet, + std::move(SRematCandidates), DT, PDT, MLI, MRI, + /*IsVGPR*/ false, Status.MemBound); + if (NewRematSCnt > RematSCnt) { + // Has enough remat node to cover rematCnt. + int RematCnt = 0; + for (RematNode &Node : SRematList) { + SRematMap[Node.Reg] = Node; + RematCnt += Node.Size; + // Stop if the size had reached the required amount, unless + // aggressive is set. + if (RematCnt > RematSCnt && !EnableAggressiveSgpr) + break; + } + NewRematSCnt = 0; + } else { + for (RematNode &Node : SRematList) { + SReducedInsts.insert(Node.DefMI); + } + // Check shared size. These are reg uses that are shared among all the + // instructions. The overlap will not actually contribute to the + // pressure increase when an instruction is moved/cloned, so it can be + // treated as a gain. + int SharedReducedSize = + getSharedReducedSize(SReducedInsts, /*IsVGPR*/ false, MRI, SIRI); + + int LocalGains = 0; + if (((NewRematSCnt + SharedReducedSize) + (int)SgprLimitBias) >= + RematSCnt) { + for (RematNode &Node : SRematList) + SRematMap[Node.Reg] = Node; + } else { + if (!IsForceRematSgpr) + return false; + for (RematNode &Node : SRematList) + SRematMap[Node.Reg] = Node; + // Find local one def one use candidates. + for (MachineInstr &MI : *MBB) { + if (MI.isDebugInstr()) + continue; + if (MI.getDesc().NumDefs != 1) + continue; + MachineOperand &DstMO = MI.getOperand(0); + Register Reg = DstMO.getReg(); + if (!SIRI->isSGPRReg(MRI, Reg)) + continue; + if (!MRI.hasOneNonDBGUse(Reg)) + continue; + if (!MRI.hasOneDef(Reg)) + continue; + if (Reg.isPhysical()) + continue; + MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(Reg); + if (UseMI.getParent() != MBB) + continue; + int Gain = rematGainInBits(&MI, Reg, MRI, SIRI, + /*IsVGPR*/ false); + if (Gain > 0) { + // Skip case when DefMI has implicit define which used by UseMI. + if (isImplicitDefUse(&MI, &UseMI)) + continue; + RematNode Node = {Reg, &MI, (unsigned)Gain >> 5}; + Node.InsertPointMI = &UseMI; + Node.Kind = RematNode::RematKind::OneDefOneUse; + SRematMap[Reg] = Node; + LocalGains += Node.Size; + } + } + } + NewRematSCnt = + RematSCnt - NewRematSCnt - SharedReducedSize - LocalGains; + } + } + // If works, continue. + + // Collect live range from hot inst. + // find common live range in hot insts. + // Remat these common live range. 
+ // Apply the remat. + + int NewRematVCnt = 0; + if (RematVCnt > 0) { + // TODO: V remat. + } + + bool NeedSRemat = RematSCnt > 0; + bool NeedVRemat = RematVCnt > 0; + // If sgpr spill, always do remat. + bool IsSRematOK = + (NewRematSCnt <= 0 && !SRematMap.empty()) || IsForceRematSgpr; + bool IsVRematOK = + (Status.NotBalance || NewRematVCnt <= 0) && !VRematMap.empty(); + if (NeedSRemat && NeedVRemat) { + if (IsVRematOK && IsSRematOK) + IsUpdated = true; + else if (IsSGPRSpill) + IsUpdated = true; + } else if (NeedSRemat) { + if (IsSRematOK) + IsUpdated = true; + } else if (NeedVRemat) { + if (IsVRematOK) + IsUpdated = true; + } + // TODO: what to do when cannot reach target? + if (NewRematSCnt > 0) { + if ((unsigned)NewRematSCnt <= ST->getSGPRAllocGranule()) { + IsNearTarget = true; + } else { + if (!IsSGPRSpill) + return false; + } + } + } + + if (SRematMap.empty() && VRematMap.empty()) { + return IsUpdated; + } + + if (!SRematMap.empty()) { + IsUpdated = true; + applyRemat(SRematMap, HotBlocks, DT, SlotIndexes, MRI, SIRI, SIII, LIS, MF); + LLVM_DEBUG(llvm::dbgs() << "after hotremat"; MF.print(dbgs());); + } + + // Balance between vector and scalar if possible. + return IsUpdated; +} + +bool AMDGPUHotBlockRematerialize::runOnMachineFunction(MachineFunction &MF) { + if (MF.size() < 2) + return false; + LiveIntervals *LIS = &getAnalysis().getLIS(); + MachineDominatorTree *DT = + &getAnalysis().getDomTree(); + MachinePostDominatorTree *PDT = + &getAnalysis().getPostDomTree(); + MachineLoopInfo *MLI = &getAnalysis().getLI(); + + bool IsNearTarget = false; + return hotBlockRemat(MF, MLI, LIS, DT, PDT, IsNearTarget); +} + +} // namespace + +INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_END(AMDGPUHotBlockRematerialize, DEBUG_TYPE, + "AMDGPU rematerialize", false, false) + +char AMDGPUHotBlockRematerialize::ID = 0; +char &llvm::AMDGPUHotBlockRematerializeID = AMDGPUHotBlockRematerialize::ID; + +FunctionPass *llvm::createAMDGPUHotBlockRematerializePass() { + return new AMDGPUHotBlockRematerialize(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp new file mode 100644 index 0000000000000..4c55d172018d4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.cpp @@ -0,0 +1,254 @@ +//===------- AMDGPUMIRUtils.cpp - Helpers for MIR passes ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMIRUtils.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define DEBUG_TYPE "xb-mir-util" +using namespace llvm; + +namespace llvm { +bool getNonDebugMBBEnd(MachineBasicBlock::reverse_iterator &BBEnd, + MachineBasicBlock &MBB) { + // R.End doesn't point to the boundary instruction. + // Skip Debug instr. + while (BBEnd != MBB.rend() && BBEnd->isDebugInstr()) + BBEnd++; + return BBEnd != MBB.rend(); +} +} // namespace llvm + +namespace { + +// LoopInfo contains a mapping from basic block to the innermost loop. Find +// the outermost loop in the loop nest that contains BB. +const MachineLoop *getOutermostLoop(const MachineLoopInfo *LI, + const MachineBasicBlock *BB) { + const MachineLoop *L = LI->getLoopFor(BB); + if (L) { + while (const MachineLoop *Parent = L->getParentLoop()) + L = Parent; + } + return L; +} + +bool loopContainsBoth(const MachineLoopInfo *LI, const MachineBasicBlock *BB1, + const MachineBasicBlock *BB2) { + const MachineLoop *L1 = getOutermostLoop(LI, BB1); + const MachineLoop *L2 = getOutermostLoop(LI, BB2); + return L1 != nullptr && L1 == L2; +} + +} // namespace + +namespace llvm { + +bool isSccLiveAt(const MachineInstr &MI, LiveIntervals *LIS) { + if (!LIS) + return true; + const TargetRegisterInfo *TRI = MI.getMF()->getSubtarget().getRegisterInfo(); + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + SlotIndex Idx = LIS->getInstructionIndex(MI); + return LR.liveAt(Idx); +} + +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( + MachineBasicBlock *MBB, MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, const SIInstrInfo *TII, + MachineRegisterInfo *MRI, LiveIntervals *LIS, + SccDefInsertPointConstraintFlags Constraints) { + // If SCC is dead at MI when we can use MI as the insert point. + if (!llvm::isSccLiveAt(*MI, LIS)) + return MI; + + const bool CheckForExecWrite = + Constraints & SccDefInsertPointConstraintFlags::NoExecWrite; + + MachineBasicBlock::reverse_iterator Start = MI.getReverse(); + + // Otherwise, walk backwards through the block looking for a location where + // SCC is dead. + for (MachineBasicBlock::reverse_iterator It = Start, End = MBB->rend(); + It != End; ++It) { + // If the instruction modifies exec then we cannot use it as + // an insertion point (if that is a constraint from the caller). + // The check for EXEC works for both wave64 and wave32 because + // it will also catch Writes to the subregisters (e.g. exec_lo). + if (CheckForExecWrite && It->modifiesRegister(AMDGPU::EXEC, TRI)) + break; + + if (!llvm::isSccLiveAt(*It, LIS)) + return It->getIterator(); + } + + // If no safe location can be found in the block we can save and restore + // SCC around MI. 
There is no way to directly read or Write SCC so we use + // s_cselect to read the current value of SCC and s_cmp to Write the saved + // value back to SCC. + // + // The generated code will look like this; + // + // %SavedSCC = COPY $scc # Save SCC + // <----- Newly created safe insert point. + // MI + // $scc = COPY %SavedSCC # Restore SCC + // + Register TmpScc = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + DebugLoc DL = MI->getDebugLoc(); + auto CopyFrom = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), TmpScc).addReg(AMDGPU::SCC); + auto CopyTo = BuildMI(*MBB, std::next(MI->getIterator()), DL, + TII->get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(TmpScc); + + // Cut the live segment. + auto SlotIndexes = LIS->getSlotIndexes(); + SlotIndexes->insertMachineInstrInMaps(*CopyFrom); + SlotIndexes->insertMachineInstrInMaps(*CopyTo); + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + auto OldSegment = *LR.getSegmentContaining(LIS->getInstructionIndex(*MI)); + LiveRange::Segment NewSegA( + OldSegment.start, + SlotIndexes->getInstructionIndex(*CopyFrom).getRegSlot(), + OldSegment.valno); + LiveRange::Segment NewSegB(LIS->getInstructionIndex(*CopyTo).getRegSlot(), + OldSegment.end, OldSegment.valno); + LR.removeSegment(OldSegment); + LR.addSegment(NewSegA); + LR.addSegment(NewSegB); + + return MI; +} + +void dumpLiveSet(const LiveSet &LiveSet, const SIRegisterInfo *SIRI) { + + dbgs() << "\n live set: \n"; + for (auto It : LiveSet) { + int Reg = It.first; + dbgs() << printReg(Reg, SIRI); + if (It.second.any()) + dbgs() << " mask:" << It.second.getAsInteger(); + dbgs() << "\n"; + } +} + +unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask, + const llvm::MachineRegisterInfo &MRI, + const llvm::SIRegisterInfo *SIRI) { + unsigned Size = SIRI->getRegSizeInBits(*MRI.getRegClass(Reg)); + Size >>= 5; + if (Mask.any()) { + if (unsigned MaskSize = Mask.getNumLanes()) { + if (MaskSize < Size) + Size = MaskSize; + } + } + return Size; +} + +void collectLiveSetPressure(const LiveSet &LiveSet, + const MachineRegisterInfo &MRI, + const SIRegisterInfo *SIRI, unsigned &VPressure, + unsigned &SPressure) { + VPressure = 0; + SPressure = 0; + for (auto LiveIt : LiveSet) { + unsigned Reg = LiveIt.first; + unsigned Size = getRegSize(Reg, LiveIt.second, MRI, SIRI); + if (SIRI->isVGPR(MRI, Reg)) + VPressure += Size; + else + SPressure += Size; + } +} + +bool isSub0Sub1SingleDef(unsigned Reg, const MachineRegisterInfo &MRI) { + // Support multi def for pattern of pointer: + // undef_ %808.sub0:sgpr_64 = COPY killed %795:sgpr_32 + // %808.sub1:sgpr_64 = S_MOV_B32 0 + bool HasSub0 = false; + bool HasSub1 = false; + for (MachineOperand &UserDefMO : MRI.def_operands(Reg)) { + if (unsigned SubReg = UserDefMO.getSubReg()) { + bool IsSingleSubReg = false; + switch (SubReg) { + default: + break; + case AMDGPU::sub0: + if (!HasSub0) { + HasSub0 = true; + IsSingleSubReg = true; + } + break; + case AMDGPU::sub1: + if (!HasSub1) { + HasSub1 = true; + IsSingleSubReg = true; + } + break; + } + if (!IsSingleSubReg) { + HasSub0 = false; + break; + } + } else { + HasSub0 = false; + break; + } + } + + return (HasSub0 && HasSub1); +} + +bool reach_block(MachineBasicBlock *FromBB, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineLoopInfo *LI, + MachineBasicBlock *ToBB) { + if (FromBB == ToBB) + return true; + + if (DT->dominates(FromBB, ToBB)) + return true; + + if (PDT->dominates(ToBB, FromBB)) + return true; + + if (loopContainsBoth(LI, ToBB, FromBB)) + return 
true; + + // TODO: cover case hotBB in loop, + // one block in that loop dom BB or + // BB post dom one block in that loop. + return false; +} +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h new file mode 100644 index 0000000000000..14cd350398f4c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMIRUtils.h @@ -0,0 +1,94 @@ +//===------- AMDGPUMIRUtils.h - Helpers for MIR passes --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for MIR passes. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMIRUTILS_H + +#include "llvm/ADT/DenseSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" + +namespace llvm { + +class LiveInterval; +class LiveIntervals; +class SlotIndexes; +class MachineRegisterInfo; +class SIRegisterInfo; +class SIInstrInfo; +class MachineDominatorTree; +class MachinePostDominatorTree; + +constexpr unsigned RegForVCC = 2; + +bool getNonDebugMBBEnd(llvm::MachineBasicBlock::reverse_iterator &BBEnd, + llvm::MachineBasicBlock &MBB); + +bool isSub0Sub1SingleDef(unsigned Reg, const llvm::MachineRegisterInfo &MRI); + +using LiveSet = llvm::DenseMap; +void dumpLiveSet(const LiveSet &LiveSet, const llvm::SIRegisterInfo *SIRI); + +bool isSccLiveAt(const MachineInstr &MI, LiveIntervals *LIS); + +// An enum used to pass additional constraints to +// `FindOrCreateInsertionPointForSccDef()`. This will further +// constrain the location where the scc def can be inserted. +enum SccDefInsertPointConstraintFlags { + None = 0, // No additional constraints. + NoExecWrite = 1, // Should be no modification of exec between BeforeInst and + // insert point. +}; + +// Look for a safe place to insert an instruction that defines scc. +// +// +// This function is useful for when we need to insert a new +// instruction that defines scc in a block and we need to find +// a location that will not smash the existing value. +// +// Starting at `BeforeInst` it will look backwards to try to find +// a place in the block where scc is dead so we can insert our new +// def there. If no location can be found it will save and restore +// scc around BeforeInst. This way BeforeInst can safely be used +// as the new insert location. +// +llvm::MachineBasicBlock::iterator findOrCreateInsertionPointForSccDef( + llvm::MachineBasicBlock *MBB, llvm::MachineBasicBlock::iterator BeforeInst, + const llvm::TargetRegisterInfo *TRI, const llvm::SIInstrInfo *TII, + llvm::MachineRegisterInfo *MRI, LiveIntervals *LIS, + SccDefInsertPointConstraintFlags Constraints = + SccDefInsertPointConstraintFlags::None); + +// For inst like S_BUFFER_LOAD_DWORDX16, change to S_BUFFER_LOAD_DWORDX4 if only +// used 4 lanes. 
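+// For example (illustrative, hypothetical registers): if
+//   %300 = S_BUFFER_LOAD_DWORDX16_IMM %desc, 0, 0
+// is only read through %300.sub0_sub1_sub2_sub3, the load can be shrunk to an
+// S_BUFFER_LOAD_DWORDX4_IMM, freeing the other twelve SGPRs.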
+bool removeUnusedLanes(llvm::MachineInstr &MI, llvm::MachineRegisterInfo &MRI,
+                       const llvm::SIRegisterInfo *TRI,
+                       const llvm::SIInstrInfo *TII,
+                       llvm::SlotIndexes *SlotIndexes);
+
+unsigned getRegSize(unsigned Reg, llvm::LaneBitmask &Mask,
+                    const llvm::MachineRegisterInfo &MRI,
+                    const llvm::SIRegisterInfo *SIRI);
+void collectLiveSetPressure(const LiveSet &LiveSet,
+                            const llvm::MachineRegisterInfo &MRI,
+                            const llvm::SIRegisterInfo *SIRI,
+                            unsigned &VPressure, unsigned &SPressure);
+
+bool reach_block(llvm::MachineBasicBlock *FromBB,
+                 llvm::MachineDominatorTree *DT,
+                 llvm::MachinePostDominatorTree *PDT, llvm::MachineLoopInfo *LI,
+                 llvm::MachineBasicBlock *ToBB);
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
new file mode 100644
index 0000000000000..6160fe5471376
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.cpp
@@ -0,0 +1,162 @@
+//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==------------------------------------------------------------------------==//
+//
+/// \file
+/// \brief Helper functions for occupancy and latency.
+//
+//==------------------------------------------------------------------------==//
+
+#include "AMDGPUOccupancyAndLatencyHelper.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+#include <cmath>
+
+namespace llvm {
+
+void SchedScore::sum(const SchedScore &S, unsigned LoopDepth) {
+  unsigned LoopCount = LoopDepth > 0 ? std::pow(3, LoopDepth) : 1;
+  LatencyHide += LoopCount * S.LatencyHide;
+  MemLatency += LoopCount * S.MemLatency;
+  MixAlu += LoopCount * S.MixAlu;
+  Alu += LoopCount * S.Alu;
+  Lds += LoopCount * S.Lds;
+  SgprSpill |= S.SgprSpill;
+}
+// Does more occupancy give more performance?
+bool SchedScore::isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc) const {
+  unsigned Gain = latencyGain(TargetOccupancy, ExtraOcc);
+  // 10% is good enough.
+  if ((10 * Gain) >= Alu)
+    return true;
+  return false;
+}
+
+unsigned SchedScore::latencyGain(unsigned TgtOcc, unsigned ExtraOcc) const {
+  unsigned Latency = MemLatency;
+  return (Latency / (TgtOcc)) - (Latency / (TgtOcc + ExtraOcc));
+}
+
+// AMDGPULatencyTracker
+AMDGPULatencyTracker::AMDGPULatencyTracker(const GCNSubtarget &ST)
+    : SIII(ST.getInstrInfo()), ItineraryData(ST.getInstrItineraryData()) {}
+
+void AMDGPULatencyTracker::scan(const MachineInstr &MI) {
+  if (MI.isDebugInstr())
+    return;
+  int Latency = SIII->getInstrLatency(ItineraryData, MI);
+  // If inside latency hide.
+  if (!LatencyMIs.empty()) {
+    bool IsWaitCnt = false;
+    for (auto &MO : MI.operands()) {
+      if (MO.isReg()) {
+        Register Reg = MO.getReg();
+        auto It = LatencyMIs.find(Reg);
+        if (It != LatencyMIs.end()) {
+          IsWaitCnt = true;
+          // If MI uses a mem result, update latency to the mem latency.
+          int Cycle = It->second;
+          if (Cycle > Latency)
+            Latency = Cycle;
+        }
+      }
+    }
+    // Update latency for each mem latency inst.
+    for (auto It = LatencyMIs.begin(); It != LatencyMIs.end();) {
+      auto Prev = It;
+      auto L = (It++);
+      int Cycle = L->second;
+      if (Cycle <= Latency) {
+        // Only left cycles.
+        // Remove the reg.
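+        // The tracked load's remaining cycles are covered by this
+        // instruction's latency, i.e. the load has completed, so stop
+        // tracking its destination reg.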
+ LatencyMIs.erase(Prev); + if (IsWaitCnt && Cycle == Latency) { + Score.MemLatency += Cycle; + // Only count memLatency once, the rest is hide. + IsWaitCnt = false; + } else { + // Hide cycle or count mem latency? + Score.LatencyHide += Cycle; + } + } else { + L->second -= Latency; + // Hide latency. + Score.LatencyHide += Latency; + } + } + + } else { + // TODO: check branch/lds? + // TODO: check prevVAlu? + auto GetAluStatus = [](const MachineInstr &MI, + const llvm::SIInstrInfo *SIII) { + AluStatus Status = AluStatus::Nothing; + if (SIII->isVALU(MI.getOpcode())) + Status = AluStatus::Vector; + else if (SIII->isSALU(MI.getOpcode())) + Status = AluStatus::Scalar; + return Status; + }; + AluStatus Status = GetAluStatus(MI, SIII); + + switch (PrevStatus) { + case AluStatus::Nothing: { + Score.Alu += Latency; + Score.MixAlu += Latency; + PrevStatus = Status; + } break; + case AluStatus::Vector: + case AluStatus::Scalar: { + Score.Alu += Latency; + // Ignore mix alu. + if (PrevStatus != Status) + PrevStatus = AluStatus::Nothing; + else + Score.MixAlu += Latency; + } break; + } + } + // Update latency inst. + if (SIII->isHighLatencyDef(MI.getOpcode()) && MI.mayLoad()) { + Register Reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kHighLetency = 180; + LatencyMIs[Reg] = kHighLetency; + } else if (SIII->isLowLatencyInstruction(MI) && MI.mayLoad()) { + Register Reg = MI.getOperand(0).getReg(); + // TODO: get correct latency. + // SIII->getInstrLatency(ItinerayData, MI); + constexpr unsigned kLowLetency = 35; + LatencyMIs[Reg] = kLowLetency; + } +} + +SchedScore collectLatency(MachineFunction &MF, const llvm::GCNSubtarget &ST, + const llvm::MachineLoopInfo *MLI) { + SchedScore TotalScore; + for (auto &MFI : MF) { + MachineBasicBlock &MBB = MFI; + MachineBasicBlock::iterator Next; + AMDGPULatencyTracker LatencyTracker(ST); + for (auto &MI : MBB) + LatencyTracker.scan(MI); + unsigned LoopDepth = 0; + if (MLI) + LoopDepth = MLI->getLoopDepth(&MBB); + TotalScore.sum(LatencyTracker.Score, LoopDepth); + } + return TotalScore; +} + +} // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h new file mode 100644 index 0000000000000..9c63fa7e6b4a4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUOccupancyAndLatencyHelper.h @@ -0,0 +1,75 @@ +//==- AMDGPUOccupancyAndLatencyHelper.cpp - Helpers for occupancy + latency ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Helper functions for occupancy and latency. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUOCCUPANCYANDLATENCYHELPER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/MC/MCInstrItineraries.h" + +namespace llvm { + +class MachineInstr; +class MachineFunction; +class GCNSubtarget; +class MachineLoopInfo; +class SIInstrInfo; + +struct SchedScore { + // Score for this Sched result. + unsigned Occupancy = 0; + bool SgprSpill = false; + unsigned LatencyHide = 0; // Only latency hide will split 2 load into 2 pass? + unsigned MemLatency = 0; // Only save mem latency. + // We want mem latency small and hide big. 
+  // Compare memLatency - hide * Occ; smaller is better.
+  unsigned MixAlu = 0; // VAlu and SAlu can run in parallel if Occ > 1.
+  unsigned Alu = 0;    // Avoid sequences of s_alu insts with count less than
+                       // the occupancy.
+  unsigned Lds = 0;    // Todo: count lds.
+  SchedScore() {}
+
+  void sum(const SchedScore &S, unsigned LoopDepth = 0);
+  bool isMemBound(unsigned TargetOccupancy, unsigned ExtraOcc = 1) const;
+  // More latency can be hidden with ExtraOcc.
+  unsigned latencyGain(unsigned TargetOccupancy, unsigned ExtraOcc) const;
+};
+
+struct AMDGPULatencyTracker {
+  AMDGPULatencyTracker(const llvm::GCNSubtarget &ST);
+  const llvm::SIInstrInfo *SIII;
+  const llvm::InstrItineraryData *ItineraryData;
+  // Latency MI dst reg to cycle map.
+  llvm::DenseMap<unsigned, int> LatencyMIs;
+  SchedScore Score;
+  // Low latency MIs do not wait.
+  unsigned HideLatency = 0;
+  unsigned MemLatency = 0;
+  // For simplicity, only consider the mixture as one valu and one salu.
+  // Not grouped for now.
+  unsigned PrevSAlu = 0;
+  unsigned PrevVAlu = 0;
+  enum class AluStatus {
+    Nothing,
+    Vector,
+    Scalar,
+  } PrevStatus = AluStatus::Nothing;
+  void scan(const llvm::MachineInstr &MI);
+};
+
+SchedScore collectLatency(llvm::MachineFunction &MF,
+                          const llvm::GCNSubtarget &ST,
+                          const llvm::MachineLoopInfo *MLI = nullptr);
+
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 90e3489ced923..9c1aec6cd047d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -397,6 +397,12 @@ static cl::opt<bool>
                          cl::desc("Enable s_delay_alu insertion"),
                          cl::init(true), cl::Hidden);
 
+// Enable Hot block rematerialize.
+static cl::opt<bool>
+    EnableHotBlockRemat("amdgpu-enable-hot-block-remat",
+                        cl::desc("Enable HotBlock Rematerialize optimization"),
+                        cl::init(false), cl::Hidden);
+
 // Enable GFX11+ VOPD
 static cl::opt<bool>
     EnableVOPD("amdgpu-enable-vopd",
@@ -521,6 +527,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
+  initializeAMDGPUHotBlockRematerializePass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
@@ -1539,6 +1546,10 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   if (TM->getOptLevel() > CodeGenOptLevel::Less)
     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
+  // Rematerialize must be run before phi elimination.
+  if (isPassEnabled(EnableHotBlockRemat))
+    addPass(&AMDGPUHotBlockRematerializeID);
+
   TargetPassConfig::addOptimizedRegAlloc();
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 09a3096602fc3..79fdbba1d0db1 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
+  AMDGPUHotBlockRematerialize.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
@@ -81,10 +82,12 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMemoryUtils.cpp
+  AMDGPUMIRUtils.cpp
   AMDGPUIGroupLP.cpp
   AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
+  AMDGPUOccupancyAndLatencyHelper.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
AMDGPUPreLegalizerCombiner.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index f74d12cfab0c0..7f76d14eb9ab0 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -549,22 +549,26 @@ bool GCNDownwardRPTracker::advanceBeforeNext(MachineInstr *MI, if (!S.liveAt(SI)) { if (It == LiveRegs.end()) { It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) + if (!MRI->isSSA() && It == LiveRegs.end()) llvm_unreachable("register isn't live"); } - auto PrevMask = It->second; - It->second &= ~S.LaneMask; - CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + if (It != LiveRegs.end()) { + auto PrevMask = It->second; + It->second &= ~S.LaneMask; + CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); + } } } if (It != LiveRegs.end() && It->second.none()) LiveRegs.erase(It); } else if (!LI.liveAt(SI)) { auto It = LiveRegs.find(MO.getReg()); - if (It == LiveRegs.end()) + if (!MRI->isSSA() && It == LiveRegs.end()) llvm_unreachable("register isn't live"); - CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); - LiveRegs.erase(It); + if (It != LiveRegs.end()) { + CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); + LiveRegs.erase(It); + } } } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 7554b9f578fcb..aa4b3f948b726 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -47,6 +47,10 @@ struct GCNRegPressure { void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } + unsigned getMaxSGPR() const { + return std::max(getSGPRNum(), getSGPRTuplesWeight()); + } + /// \returns the SGPR32 pressure unsigned getSGPRNum() const { return Value[SGPR32]; } /// \returns the aggregated ArchVGPR32, AccVGPR32 pressure dependent upon \p diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir new file mode 100644 index 0000000000000..d6c6173cd523e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr.mir @@ -0,0 +1,179 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# Check that the loads have been moved to the use +# CHECK: bb.0: +# CHECK-NOT: S_LOAD_DWORDX4_IMM +# CHECK: bb.2: +# CHECK: %t0:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 0, 0 +# CHECK: KILL %t0 +# CHECK: %t2:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 16, 0 +# CHECK: KILL %t2 +# CHECK: %t4:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 32, 0 +# CHECK: KILL %t4 +# CHECK: %t6:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 48, 0 +# CHECK: KILL %t6 +# CHECK: %t8:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 64, 0 +# CHECK: KILL %t8 +# CHECK: %t10:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 80, 0 +# CHECK: KILL %t10 +# CHECK: %t12:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 96, 0 +# CHECK: KILL %t12 +# CHECK: %t14:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 112, 0 +# CHECK: KILL %t14 +# CHECK: %t16:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 128, 0 +# CHECK: KILL %t16 +# CHECK: %t18:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 144, 0 +# CHECK: KILL %t18 +# CHECK: %t20:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 160, 0 +# CHECK: KILL %t20 +# CHECK: %t22:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 176, 0 +# CHECK: KILL %t22 +# CHECK: %t24:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 192, 0 +# CHECK: KILL %t24 +# CHECK: %t26:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 208, 0 +# CHECK: KILL %t26 +# CHECK: 
%t28:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 224, 0 +# CHECK: KILL %t28 +# CHECK: %t30:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 240, 0 +# CHECK: KILL %t30 +# CHECK: %t32:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 256, 0 +# CHECK: KILL %t32 +# CHECK: %t34:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 272, 0 +# CHECK: KILL %t34 +# CHECK: %t36:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 288, 0 +# CHECK: KILL %t36 +# CHECK: %t38:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 304, 0 +# CHECK: KILL %t38 +# CHECK: %t40:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 320, 0 +# CHECK: KILL %t40 +# CHECK: %t42:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 336, 0 +# CHECK: KILL %t42 +# CHECK: %t44:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 352, 0 +# CHECK: KILL %t44 +# CHECK: %t46:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 368, 0 +# CHECK: KILL %t46 +# CHECK: %t48:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 384, 0 +# CHECK: KILL %t48 +# CHECK: %t50:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 400, 0 +# CHECK: KILL %t50 +# CHECK: %t52:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 416, 0 +# CHECK: KILL %t52 +# CHECK: %t54:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 432, 0 +# CHECK: KILL %t54 +# CHECK: %t56:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 448, 0 +# CHECK: KILL %t56 +# CHECK: %t58:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 464, 0 +# CHECK: KILL %t58 +# CHECK: %t60:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 480, 0 +# CHECK: KILL %t60 +# CHECK: %t62:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 496, 0 +# CHECK: KILL %t62 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... +--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + %ptr:sgpr_64 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 0, 0 + %t2:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 16, 0 + %t4:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 32, 0 + %t6:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 48, 0 + %t8:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 64, 0 + %t10:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 80, 0 + %t12:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 96, 0 + %t14:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 112, 0 + %t16:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 128, 0 + %t18:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 144, 0 + %t20:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 160, 0 + %t22:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 176, 0 + %t24:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 192, 0 + %t26:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 208, 0 + %t28:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 224, 0 + %t30:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 240, 0 + %t32:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 256, 0 + %t34:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 272, 0 + %t36:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 288, 0 + %t38:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 304, 0 + %t40:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 320, 0 + %t42:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 336, 0 + %t44:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 352, 0 + %t46:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 368, 0 + %t48:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 384, 0 + %t50:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 400, 0 + %t52:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 416, 0 + %t54:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 432, 0 + %t56:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 448, 0 + %t58:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 464, 0 + %t60:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 480, 0 + %t62:sgpr_128 = S_LOAD_DWORDX4_IMM %ptr, 496, 0 + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + 
successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir new file mode 100644 index 0000000000000..a4e9c69d53b7c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_long_scc.mir @@ -0,0 +1,575 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that when there are no safe spot to clone/move instructions that +# modify $scc, a safe spot is created for it. + +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32: +# CHECK: bb.2: +# Save scc +# CHECK: %[[#scc0:]]:sreg_32_xm0 = COPY $scc +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: KILL %t0 +# All subsequent moves are placed within the safe spot created for the first one. +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: 
%t114:sgpr_32 = S_NOT_B32 57 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: %t150:sgpr_32 = S_NOT_B32 75 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: %t250:sgpr_32 = S_NOT_B32 125 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# Restore scc +# CHECK: $scc = COPY %[[#scc0]] +# CHECK: KILL %t2 +# CHECK: KILL %t4 +# CHECK: KILL %t6 +# CHECK: KILL %t8 +# CHECK: KILL %t10 +# CHECK: KILL %t12 +# CHECK: KILL %t14 +# CHECK: KILL %t16 +# CHECK: KILL %t18 +# CHECK: KILL %t20 +# CHECK: KILL %t22 +# CHECK: KILL %t24 +# CHECK: KILL %t26 +# CHECK: KILL %t28 +# CHECK: KILL %t30 +# CHECK: KILL %t32 +# CHECK: KILL %t34 +# CHECK: KILL %t36 +# CHECK: KILL %t38 +# CHECK: KILL %t40 +# CHECK: KILL %t42 +# CHECK: KILL %t44 +# CHECK: KILL %t46 +# CHECK: KILL %t48 +# CHECK: KILL %t50 +# CHECK: KILL %t52 +# CHECK: KILL %t54 +# CHECK: KILL %t56 +# CHECK: KILL %t58 +# CHECK: KILL %t60 +# CHECK: KILL %t62 +# CHECK: KILL %t64 +# CHECK: KILL %t66 +# CHECK: KILL %t68 +# CHECK: KILL %t70 +# CHECK: KILL %t72 +# 
CHECK: KILL %t74 +# CHECK: KILL %t76 +# CHECK: KILL %t78 +# CHECK: KILL %t80 +# CHECK: KILL %t82 +# CHECK: KILL %t84 +# CHECK: KILL %t86 +# CHECK: KILL %t88 +# CHECK: KILL %t90 +# CHECK: KILL %t92 +# CHECK: KILL %t94 +# CHECK: KILL %t96 +# CHECK: KILL %t98 +# CHECK: KILL %t100 +# CHECK: KILL %t102 +# CHECK: KILL %t104 +# CHECK: KILL %t106 +# CHECK: KILL %t108 +# CHECK: KILL %t110 +# CHECK: KILL %t112 +# CHECK: KILL %t114 +# CHECK: KILL %t116 +# CHECK: KILL %t118 +# CHECK: KILL %t120 +# CHECK: KILL %t122 +# CHECK: KILL %t124 +# CHECK: KILL %t126 +# CHECK: KILL %t128 +# CHECK: KILL %t130 +# CHECK: KILL %t132 +# CHECK: KILL %t134 +# CHECK: KILL %t136 +# CHECK: KILL %t138 +# CHECK: KILL %t140 +# CHECK: KILL %t142 +# CHECK: KILL %t144 +# CHECK: KILL %t146 +# CHECK: KILL %t148 +# CHECK: KILL %t150 +# CHECK: KILL %t152 +# CHECK: KILL %t154 +# CHECK: KILL %t156 +# CHECK: KILL %t158 +# CHECK: KILL %t160 +# CHECK: KILL %t162 +# CHECK: KILL %t164 +# CHECK: KILL %t166 +# CHECK: KILL %t168 +# CHECK: KILL %t170 +# CHECK: KILL %t172 +# CHECK: KILL %t174 +# CHECK: KILL %t176 +# CHECK: KILL %t178 +# CHECK: KILL %t180 +# CHECK: KILL %t182 +# CHECK: KILL %t184 +# CHECK: KILL %t186 +# CHECK: KILL %t188 +# CHECK: KILL %t190 +# CHECK: KILL %t192 +# CHECK: KILL %t194 +# CHECK: KILL %t196 +# CHECK: KILL %t198 +# CHECK: KILL %t200 +# CHECK: KILL %t202 +# CHECK: KILL %t204 +# CHECK: KILL %t206 +# CHECK: KILL %t208 +# CHECK: KILL %t210 +# CHECK: KILL %t212 +# CHECK: KILL %t214 +# CHECK: KILL %t216 +# CHECK: KILL %t218 +# CHECK: KILL %t220 +# CHECK: KILL %t222 +# CHECK: KILL %t224 +# CHECK: KILL %t226 +# CHECK: KILL %t228 +# CHECK: KILL %t230 +# CHECK: KILL %t232 +# CHECK: KILL %t234 +# CHECK: KILL %t236 +# CHECK: KILL %t238 +# CHECK: KILL %t240 +# CHECK: KILL %t242 +# CHECK: KILL %t244 +# CHECK: KILL %t246 +# CHECK: KILL %t248 +# CHECK: KILL %t250 +# CHECK: KILL %t252 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... 
+--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = 
S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Def scc + $scc = IMPLICIT_DEF + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + liveins: $scc + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + liveins: $scc + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + 
KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + KILL $scc + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir new file mode 100644 index 0000000000000..39d21dbda3819 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_no_scc.mir @@ -0,0 +1,564 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that scalar instructions that define $scc are not sunk into ranges where $scc is live +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32: +# CHECK: bb.2: +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# 
CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: %t114:sgpr_32 = S_NOT_B32 57 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: %t150:sgpr_32 = S_NOT_B32 75 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: %t250:sgpr_32 = S_NOT_B32 
125 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# CHECK: KILL %t0 +# CHECK: KILL %t2 +# CHECK: KILL %t4 +# CHECK: KILL %t6 +# CHECK: KILL %t8 +# CHECK: KILL %t10 +# CHECK: KILL %t12 +# CHECK: KILL %t14 +# CHECK: KILL %t16 +# CHECK: KILL %t18 +# CHECK: KILL %t20 +# CHECK: KILL %t22 +# CHECK: KILL %t24 +# CHECK: KILL %t26 +# CHECK: KILL %t28 +# CHECK: KILL %t30 +# CHECK: KILL %t32 +# CHECK: KILL %t34 +# CHECK: KILL %t36 +# CHECK: KILL %t38 +# CHECK: KILL %t40 +# CHECK: KILL %t42 +# CHECK: KILL %t44 +# CHECK: KILL %t46 +# CHECK: KILL %t48 +# CHECK: KILL %t50 +# CHECK: KILL %t52 +# CHECK: KILL %t54 +# CHECK: KILL %t56 +# CHECK: KILL %t58 +# CHECK: KILL %t60 +# CHECK: KILL %t62 +# CHECK: KILL %t64 +# CHECK: KILL %t66 +# CHECK: KILL %t68 +# CHECK: KILL %t70 +# CHECK: KILL %t72 +# CHECK: KILL %t74 +# CHECK: KILL %t76 +# CHECK: KILL %t78 +# CHECK: KILL %t80 +# CHECK: KILL %t82 +# CHECK: KILL %t84 +# CHECK: KILL %t86 +# CHECK: KILL %t88 +# CHECK: KILL %t90 +# CHECK: KILL %t92 +# CHECK: KILL %t94 +# CHECK: KILL %t96 +# CHECK: KILL %t98 +# CHECK: KILL %t100 +# CHECK: KILL %t102 +# CHECK: KILL %t104 +# CHECK: KILL %t106 +# CHECK: KILL %t108 +# CHECK: KILL %t110 +# CHECK: KILL %t112 +# CHECK: KILL %t114 +# CHECK: KILL %t116 +# CHECK: KILL %t118 +# CHECK: KILL %t120 +# CHECK: KILL %t122 +# CHECK: KILL %t124 +# CHECK: KILL %t126 +# CHECK: KILL %t128 +# CHECK: KILL %t130 +# CHECK: KILL %t132 +# CHECK: KILL %t134 +# CHECK: KILL %t136 +# CHECK: KILL %t138 +# CHECK: KILL %t140 +# CHECK: KILL %t142 +# CHECK: KILL %t144 +# CHECK: KILL %t146 +# CHECK: KILL %t148 +# CHECK: KILL %t150 +# CHECK: KILL %t152 +# CHECK: KILL %t154 +# CHECK: KILL %t156 +# CHECK: KILL %t158 +# CHECK: KILL %t160 +# CHECK: KILL %t162 +# CHECK: KILL %t164 +# CHECK: KILL %t166 +# CHECK: KILL %t168 +# CHECK: KILL %t170 +# CHECK: KILL %t172 +# CHECK: KILL %t174 +# CHECK: KILL %t176 +# CHECK: KILL %t178 +# CHECK: KILL %t180 +# CHECK: KILL %t182 +# CHECK: KILL %t184 +# CHECK: KILL %t186 +# CHECK: KILL %t188 +# CHECK: KILL %t190 +# CHECK: KILL %t192 +# CHECK: KILL %t194 +# CHECK: KILL %t196 +# CHECK: KILL %t198 +# CHECK: KILL %t200 +# CHECK: KILL %t202 +# CHECK: KILL %t204 +# CHECK: KILL %t206 +# CHECK: KILL %t208 +# CHECK: KILL %t210 +# CHECK: KILL %t212 +# CHECK: KILL %t214 +# CHECK: KILL %t216 +# CHECK: KILL %t218 +# CHECK: KILL %t220 +# CHECK: KILL %t222 +# CHECK: KILL %t224 +# CHECK: KILL %t226 +# CHECK: KILL %t228 +# CHECK: KILL %t230 +# CHECK: KILL %t232 +# CHECK: KILL %t234 +# CHECK: KILL %t236 +# CHECK: KILL %t238 +# CHECK: KILL %t240 +# CHECK: KILL %t242 +# CHECK: KILL %t244 +# CHECK: KILL %t246 +# CHECK: KILL %t248 +# CHECK: KILL %t250 +# CHECK: KILL %t252 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... 
+--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = 
S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + $scc = IMPLICIT_DEF + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 
+ KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + KILL $scc + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir new file mode 100644 index 0000000000000..305bf87a6120e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_phi.mir @@ -0,0 +1,304 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test simply checks that GCNDownwardRPTracker does not crash when PHIs are present +# CHECK: S_ENDPGM + + +--- | + define amdgpu_ps void @main() { + ret void + } +... 
+--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def $scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = 
S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + %s0:sgpr_32 = IMPLICIT_DEF + S_BRANCH %bb.2 + + bb.2: + %phi0:sgpr_32 = PHI %t0, %bb.0, %s0, %bb.1 + %phi2:sgpr_32 = PHI %t2, %bb.0, %s0, %bb.1 + %phi4:sgpr_32 = PHI %t4, %bb.0, %s0, %bb.1 + %phi6:sgpr_32 = PHI %t6, %bb.0, %s0, %bb.1 + %phi8:sgpr_32 = 
PHI %t8, %bb.0, %s0, %bb.1 + %phi10:sgpr_32 = PHI %t10, %bb.0, %s0, %bb.1 + %phi12:sgpr_32 = PHI %t12, %bb.0, %s0, %bb.1 + %phi14:sgpr_32 = PHI %t14, %bb.0, %s0, %bb.1 + %phi16:sgpr_32 = PHI %t16, %bb.0, %s0, %bb.1 + %phi18:sgpr_32 = PHI %t18, %bb.0, %s0, %bb.1 + %phi20:sgpr_32 = PHI %t20, %bb.0, %s0, %bb.1 + %phi22:sgpr_32 = PHI %t22, %bb.0, %s0, %bb.1 + %phi24:sgpr_32 = PHI %t24, %bb.0, %s0, %bb.1 + %phi26:sgpr_32 = PHI %t26, %bb.0, %s0, %bb.1 + %phi28:sgpr_32 = PHI %t28, %bb.0, %s0, %bb.1 + %phi30:sgpr_32 = PHI %t30, %bb.0, %s0, %bb.1 + %phi32:sgpr_32 = PHI %t32, %bb.0, %s0, %bb.1 + %phi34:sgpr_32 = PHI %t34, %bb.0, %s0, %bb.1 + %phi36:sgpr_32 = PHI %t36, %bb.0, %s0, %bb.1 + %phi38:sgpr_32 = PHI %t38, %bb.0, %s0, %bb.1 + %phi40:sgpr_32 = PHI %t40, %bb.0, %s0, %bb.1 + %phi42:sgpr_32 = PHI %t42, %bb.0, %s0, %bb.1 + %phi44:sgpr_32 = PHI %t44, %bb.0, %s0, %bb.1 + %phi46:sgpr_32 = PHI %t46, %bb.0, %s0, %bb.1 + %phi48:sgpr_32 = PHI %t48, %bb.0, %s0, %bb.1 + %phi50:sgpr_32 = PHI %t50, %bb.0, %s0, %bb.1 + %phi52:sgpr_32 = PHI %t52, %bb.0, %s0, %bb.1 + %phi54:sgpr_32 = PHI %t54, %bb.0, %s0, %bb.1 + %phi56:sgpr_32 = PHI %t56, %bb.0, %s0, %bb.1 + %phi58:sgpr_32 = PHI %t58, %bb.0, %s0, %bb.1 + %phi60:sgpr_32 = PHI %t60, %bb.0, %s0, %bb.1 + %phi62:sgpr_32 = PHI %t62, %bb.0, %s0, %bb.1 + %phi64:sgpr_32 = PHI %t64, %bb.0, %s0, %bb.1 + %phi66:sgpr_32 = PHI %t66, %bb.0, %s0, %bb.1 + %phi68:sgpr_32 = PHI %t68, %bb.0, %s0, %bb.1 + %phi70:sgpr_32 = PHI %t70, %bb.0, %s0, %bb.1 + %phi72:sgpr_32 = PHI %t72, %bb.0, %s0, %bb.1 + %phi74:sgpr_32 = PHI %t74, %bb.0, %s0, %bb.1 + %phi76:sgpr_32 = PHI %t76, %bb.0, %s0, %bb.1 + %phi78:sgpr_32 = PHI %t78, %bb.0, %s0, %bb.1 + %phi80:sgpr_32 = PHI %t80, %bb.0, %s0, %bb.1 + %phi82:sgpr_32 = PHI %t82, %bb.0, %s0, %bb.1 + %phi84:sgpr_32 = PHI %t84, %bb.0, %s0, %bb.1 + %phi86:sgpr_32 = PHI %t86, %bb.0, %s0, %bb.1 + %phi88:sgpr_32 = PHI %t88, %bb.0, %s0, %bb.1 + %phi90:sgpr_32 = PHI %t90, %bb.0, %s0, %bb.1 + %phi92:sgpr_32 = PHI %t92, %bb.0, %s0, %bb.1 + %phi94:sgpr_32 = PHI %t94, %bb.0, %s0, %bb.1 + %phi96:sgpr_32 = PHI %t96, %bb.0, %s0, %bb.1 + %phi98:sgpr_32 = PHI %t98, %bb.0, %s0, %bb.1 + %phi100:sgpr_32 = PHI %t100, %bb.0, %s0, %bb.1 + %phi102:sgpr_32 = PHI %t102, %bb.0, %s0, %bb.1 + %phi104:sgpr_32 = PHI %t104, %bb.0, %s0, %bb.1 + %phi106:sgpr_32 = PHI %t106, %bb.0, %s0, %bb.1 + %phi108:sgpr_32 = PHI %t108, %bb.0, %s0, %bb.1 + %phi110:sgpr_32 = PHI %t110, %bb.0, %s0, %bb.1 + %phi112:sgpr_32 = PHI %t112, %bb.0, %s0, %bb.1 + %phi114:sgpr_32 = PHI %t114, %bb.0, %s0, %bb.1 + %phi116:sgpr_32 = PHI %t116, %bb.0, %s0, %bb.1 + %phi118:sgpr_32 = PHI %t118, %bb.0, %s0, %bb.1 + %phi120:sgpr_32 = PHI %t120, %bb.0, %s0, %bb.1 + %phi122:sgpr_32 = PHI %t122, %bb.0, %s0, %bb.1 + %phi124:sgpr_32 = PHI %t124, %bb.0, %s0, %bb.1 + %phi126:sgpr_32 = PHI %t126, %bb.0, %s0, %bb.1 + %phi128:sgpr_32 = PHI %t128, %bb.0, %s0, %bb.1 + %phi130:sgpr_32 = PHI %t130, %bb.0, %s0, %bb.1 + %phi132:sgpr_32 = PHI %t132, %bb.0, %s0, %bb.1 + %phi134:sgpr_32 = PHI %t134, %bb.0, %s0, %bb.1 + %phi136:sgpr_32 = PHI %t136, %bb.0, %s0, %bb.1 + %phi138:sgpr_32 = PHI %t138, %bb.0, %s0, %bb.1 + %phi140:sgpr_32 = PHI %t140, %bb.0, %s0, %bb.1 + %phi142:sgpr_32 = PHI %t142, %bb.0, %s0, %bb.1 + %phi144:sgpr_32 = PHI %t144, %bb.0, %s0, %bb.1 + %phi146:sgpr_32 = PHI %t146, %bb.0, %s0, %bb.1 + %phi148:sgpr_32 = PHI %t148, %bb.0, %s0, %bb.1 + %phi150:sgpr_32 = PHI %t150, %bb.0, %s0, %bb.1 + %phi152:sgpr_32 = PHI %t152, %bb.0, %s0, %bb.1 + %phi154:sgpr_32 = PHI %t154, %bb.0, %s0, %bb.1 + %phi156:sgpr_32 = PHI %t156, %bb.0, 
%s0, %bb.1 + %phi158:sgpr_32 = PHI %t158, %bb.0, %s0, %bb.1 + %phi160:sgpr_32 = PHI %t160, %bb.0, %s0, %bb.1 + %phi162:sgpr_32 = PHI %t162, %bb.0, %s0, %bb.1 + %phi164:sgpr_32 = PHI %t164, %bb.0, %s0, %bb.1 + %phi166:sgpr_32 = PHI %t166, %bb.0, %s0, %bb.1 + %phi168:sgpr_32 = PHI %t168, %bb.0, %s0, %bb.1 + %phi170:sgpr_32 = PHI %t170, %bb.0, %s0, %bb.1 + %phi172:sgpr_32 = PHI %t172, %bb.0, %s0, %bb.1 + %phi174:sgpr_32 = PHI %t174, %bb.0, %s0, %bb.1 + %phi176:sgpr_32 = PHI %t176, %bb.0, %s0, %bb.1 + %phi178:sgpr_32 = PHI %t178, %bb.0, %s0, %bb.1 + %phi180:sgpr_32 = PHI %t180, %bb.0, %s0, %bb.1 + %phi182:sgpr_32 = PHI %t182, %bb.0, %s0, %bb.1 + %phi184:sgpr_32 = PHI %t184, %bb.0, %s0, %bb.1 + %phi186:sgpr_32 = PHI %t186, %bb.0, %s0, %bb.1 + %phi188:sgpr_32 = PHI %t188, %bb.0, %s0, %bb.1 + %phi190:sgpr_32 = PHI %t190, %bb.0, %s0, %bb.1 + %phi192:sgpr_32 = PHI %t192, %bb.0, %s0, %bb.1 + %phi194:sgpr_32 = PHI %t194, %bb.0, %s0, %bb.1 + %phi196:sgpr_32 = PHI %t196, %bb.0, %s0, %bb.1 + %phi198:sgpr_32 = PHI %t198, %bb.0, %s0, %bb.1 + %phi200:sgpr_32 = PHI %t200, %bb.0, %s0, %bb.1 + %phi202:sgpr_32 = PHI %t202, %bb.0, %s0, %bb.1 + %phi204:sgpr_32 = PHI %t204, %bb.0, %s0, %bb.1 + %phi206:sgpr_32 = PHI %t206, %bb.0, %s0, %bb.1 + %phi208:sgpr_32 = PHI %t208, %bb.0, %s0, %bb.1 + %phi210:sgpr_32 = PHI %t210, %bb.0, %s0, %bb.1 + %phi212:sgpr_32 = PHI %t212, %bb.0, %s0, %bb.1 + %phi214:sgpr_32 = PHI %t214, %bb.0, %s0, %bb.1 + %phi216:sgpr_32 = PHI %t216, %bb.0, %s0, %bb.1 + %phi218:sgpr_32 = PHI %t218, %bb.0, %s0, %bb.1 + %phi220:sgpr_32 = PHI %t220, %bb.0, %s0, %bb.1 + %phi222:sgpr_32 = PHI %t222, %bb.0, %s0, %bb.1 + %phi224:sgpr_32 = PHI %t224, %bb.0, %s0, %bb.1 + %phi226:sgpr_32 = PHI %t226, %bb.0, %s0, %bb.1 + %phi228:sgpr_32 = PHI %t228, %bb.0, %s0, %bb.1 + %phi230:sgpr_32 = PHI %t230, %bb.0, %s0, %bb.1 + %phi232:sgpr_32 = PHI %t232, %bb.0, %s0, %bb.1 + %phi234:sgpr_32 = PHI %t234, %bb.0, %s0, %bb.1 + %phi236:sgpr_32 = PHI %t236, %bb.0, %s0, %bb.1 + %phi238:sgpr_32 = PHI %t238, %bb.0, %s0, %bb.1 + %phi240:sgpr_32 = PHI %t240, %bb.0, %s0, %bb.1 + %phi242:sgpr_32 = PHI %t242, %bb.0, %s0, %bb.1 + %phi244:sgpr_32 = PHI %t244, %bb.0, %s0, %bb.1 + %phi246:sgpr_32 = PHI %t246, %bb.0, %s0, %bb.1 + %phi248:sgpr_32 = PHI %t248, %bb.0, %s0, %bb.1 + %phi250:sgpr_32 = PHI %t250, %bb.0, %s0, %bb.1 + %phi252:sgpr_32 = PHI %t252, %bb.0, %s0, %bb.1 + %phi254:sgpr_32 = PHI %t254, %bb.0, %s0, %bb.1 + + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... 
+ \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir new file mode 100644 index 0000000000000..94e86a61c09d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/remat/simple_sgpr_scc.mir @@ -0,0 +1,564 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs %s -o - -run-pass=amdgpu-hot-block-remat -amdgpu-remat-enable-hot-block-remat-aggressive-sgpr | FileCheck %s + +# This test checks that instructions that define $scc are sunk to their users. +# CHECK: bb.0: +# CHECK-NOT: S_NOT_B32 +# CHECK: bb.2: +# CHECK: %t0:sgpr_32 = S_NOT_B32 0 +# CHECK: KILL %t0 +# CHECK: %t2:sgpr_32 = S_NOT_B32 1 +# CHECK: KILL %t2 +# CHECK: %t4:sgpr_32 = S_NOT_B32 2 +# CHECK: KILL %t4 +# CHECK: %t6:sgpr_32 = S_NOT_B32 3 +# CHECK: KILL %t6 +# CHECK: %t8:sgpr_32 = S_NOT_B32 4 +# CHECK: KILL %t8 +# CHECK: %t10:sgpr_32 = S_NOT_B32 5 +# CHECK: KILL %t10 +# CHECK: %t12:sgpr_32 = S_NOT_B32 6 +# CHECK: KILL %t12 +# CHECK: %t14:sgpr_32 = S_NOT_B32 7 +# CHECK: KILL %t14 +# CHECK: %t16:sgpr_32 = S_NOT_B32 8 +# CHECK: KILL %t16 +# CHECK: %t18:sgpr_32 = S_NOT_B32 9 +# CHECK: KILL %t18 +# CHECK: %t20:sgpr_32 = S_NOT_B32 10 +# CHECK: KILL %t20 +# CHECK: %t22:sgpr_32 = S_NOT_B32 11 +# CHECK: KILL %t22 +# CHECK: %t24:sgpr_32 = S_NOT_B32 12 +# CHECK: KILL %t24 +# CHECK: %t26:sgpr_32 = S_NOT_B32 13 +# CHECK: KILL %t26 +# CHECK: %t28:sgpr_32 = S_NOT_B32 14 +# CHECK: KILL %t28 +# CHECK: %t30:sgpr_32 = S_NOT_B32 15 +# CHECK: KILL %t30 +# CHECK: %t32:sgpr_32 = S_NOT_B32 16 +# CHECK: KILL %t32 +# CHECK: %t34:sgpr_32 = S_NOT_B32 17 +# CHECK: KILL %t34 +# CHECK: %t36:sgpr_32 = S_NOT_B32 18 +# CHECK: KILL %t36 +# CHECK: %t38:sgpr_32 = S_NOT_B32 19 +# CHECK: KILL %t38 +# CHECK: %t40:sgpr_32 = S_NOT_B32 20 +# CHECK: KILL %t40 +# CHECK: %t42:sgpr_32 = S_NOT_B32 21 +# CHECK: KILL %t42 +# CHECK: %t44:sgpr_32 = S_NOT_B32 22 +# CHECK: KILL %t44 +# CHECK: %t46:sgpr_32 = S_NOT_B32 23 +# CHECK: KILL %t46 +# CHECK: %t48:sgpr_32 = S_NOT_B32 24 +# CHECK: KILL %t48 +# CHECK: %t50:sgpr_32 = S_NOT_B32 25 +# CHECK: KILL %t50 +# CHECK: %t52:sgpr_32 = S_NOT_B32 26 +# CHECK: KILL %t52 +# CHECK: %t54:sgpr_32 = S_NOT_B32 27 +# CHECK: KILL %t54 +# CHECK: %t56:sgpr_32 = S_NOT_B32 28 +# CHECK: KILL %t56 +# CHECK: %t58:sgpr_32 = S_NOT_B32 29 +# CHECK: KILL %t58 +# CHECK: %t60:sgpr_32 = S_NOT_B32 30 +# CHECK: KILL %t60 +# CHECK: %t62:sgpr_32 = S_NOT_B32 31 +# CHECK: KILL %t62 +# CHECK: %t64:sgpr_32 = S_NOT_B32 32 +# CHECK: KILL %t64 +# CHECK: %t66:sgpr_32 = S_NOT_B32 33 +# CHECK: KILL %t66 +# CHECK: %t68:sgpr_32 = S_NOT_B32 34 +# CHECK: KILL %t68 +# CHECK: %t70:sgpr_32 = S_NOT_B32 35 +# CHECK: KILL %t70 +# CHECK: %t72:sgpr_32 = S_NOT_B32 36 +# CHECK: KILL %t72 +# CHECK: %t74:sgpr_32 = S_NOT_B32 37 +# CHECK: KILL %t74 +# CHECK: %t76:sgpr_32 = S_NOT_B32 38 +# CHECK: KILL %t76 +# CHECK: %t78:sgpr_32 = S_NOT_B32 39 +# CHECK: KILL %t78 +# CHECK: %t80:sgpr_32 = S_NOT_B32 40 +# CHECK: KILL %t80 +# CHECK: %t82:sgpr_32 = S_NOT_B32 41 +# CHECK: KILL %t82 +# CHECK: %t84:sgpr_32 = S_NOT_B32 42 +# CHECK: KILL %t84 +# CHECK: %t86:sgpr_32 = S_NOT_B32 43 +# CHECK: KILL %t86 +# CHECK: %t88:sgpr_32 = S_NOT_B32 44 +# CHECK: KILL %t88 +# CHECK: %t90:sgpr_32 = S_NOT_B32 45 +# CHECK: KILL %t90 +# CHECK: %t92:sgpr_32 = S_NOT_B32 46 +# CHECK: KILL %t92 +# CHECK: %t94:sgpr_32 = S_NOT_B32 47 +# CHECK: KILL %t94 +# CHECK: %t96:sgpr_32 = S_NOT_B32 48 +# CHECK: KILL %t96 +# CHECK: %t98:sgpr_32 = S_NOT_B32 49 +# CHECK: KILL %t98 +# CHECK: %t100:sgpr_32 = S_NOT_B32 50 +# CHECK: KILL 
%t100 +# CHECK: %t102:sgpr_32 = S_NOT_B32 51 +# CHECK: KILL %t102 +# CHECK: %t104:sgpr_32 = S_NOT_B32 52 +# CHECK: KILL %t104 +# CHECK: %t106:sgpr_32 = S_NOT_B32 53 +# CHECK: KILL %t106 +# CHECK: %t108:sgpr_32 = S_NOT_B32 54 +# CHECK: KILL %t108 +# CHECK: %t110:sgpr_32 = S_NOT_B32 55 +# CHECK: KILL %t110 +# CHECK: %t112:sgpr_32 = S_NOT_B32 56 +# CHECK: KILL %t112 +# CHECK: %t114:sgpr_32 = S_NOT_B32 57 +# CHECK: KILL %t114 +# CHECK: %t116:sgpr_32 = S_NOT_B32 58 +# CHECK: KILL %t116 +# CHECK: %t118:sgpr_32 = S_NOT_B32 59 +# CHECK: KILL %t118 +# CHECK: %t120:sgpr_32 = S_NOT_B32 60 +# CHECK: KILL %t120 +# CHECK: %t122:sgpr_32 = S_NOT_B32 61 +# CHECK: KILL %t122 +# CHECK: %t124:sgpr_32 = S_NOT_B32 62 +# CHECK: KILL %t124 +# CHECK: %t126:sgpr_32 = S_NOT_B32 63 +# CHECK: KILL %t126 +# CHECK: %t128:sgpr_32 = S_NOT_B32 64 +# CHECK: KILL %t128 +# CHECK: %t130:sgpr_32 = S_NOT_B32 65 +# CHECK: KILL %t130 +# CHECK: %t132:sgpr_32 = S_NOT_B32 66 +# CHECK: KILL %t132 +# CHECK: %t134:sgpr_32 = S_NOT_B32 67 +# CHECK: KILL %t134 +# CHECK: %t136:sgpr_32 = S_NOT_B32 68 +# CHECK: KILL %t136 +# CHECK: %t138:sgpr_32 = S_NOT_B32 69 +# CHECK: KILL %t138 +# CHECK: %t140:sgpr_32 = S_NOT_B32 70 +# CHECK: KILL %t140 +# CHECK: %t142:sgpr_32 = S_NOT_B32 71 +# CHECK: KILL %t142 +# CHECK: %t144:sgpr_32 = S_NOT_B32 72 +# CHECK: KILL %t144 +# CHECK: %t146:sgpr_32 = S_NOT_B32 73 +# CHECK: KILL %t146 +# CHECK: %t148:sgpr_32 = S_NOT_B32 74 +# CHECK: KILL %t148 +# CHECK: %t150:sgpr_32 = S_NOT_B32 75 +# CHECK: KILL %t150 +# CHECK: %t152:sgpr_32 = S_NOT_B32 76 +# CHECK: KILL %t152 +# CHECK: %t154:sgpr_32 = S_NOT_B32 77 +# CHECK: KILL %t154 +# CHECK: %t156:sgpr_32 = S_NOT_B32 78 +# CHECK: KILL %t156 +# CHECK: %t158:sgpr_32 = S_NOT_B32 79 +# CHECK: KILL %t158 +# CHECK: %t160:sgpr_32 = S_NOT_B32 80 +# CHECK: KILL %t160 +# CHECK: %t162:sgpr_32 = S_NOT_B32 81 +# CHECK: KILL %t162 +# CHECK: %t164:sgpr_32 = S_NOT_B32 82 +# CHECK: KILL %t164 +# CHECK: %t166:sgpr_32 = S_NOT_B32 83 +# CHECK: KILL %t166 +# CHECK: %t168:sgpr_32 = S_NOT_B32 84 +# CHECK: KILL %t168 +# CHECK: %t170:sgpr_32 = S_NOT_B32 85 +# CHECK: KILL %t170 +# CHECK: %t172:sgpr_32 = S_NOT_B32 86 +# CHECK: KILL %t172 +# CHECK: %t174:sgpr_32 = S_NOT_B32 87 +# CHECK: KILL %t174 +# CHECK: %t176:sgpr_32 = S_NOT_B32 88 +# CHECK: KILL %t176 +# CHECK: %t178:sgpr_32 = S_NOT_B32 89 +# CHECK: KILL %t178 +# CHECK: %t180:sgpr_32 = S_NOT_B32 90 +# CHECK: KILL %t180 +# CHECK: %t182:sgpr_32 = S_NOT_B32 91 +# CHECK: KILL %t182 +# CHECK: %t184:sgpr_32 = S_NOT_B32 92 +# CHECK: KILL %t184 +# CHECK: %t186:sgpr_32 = S_NOT_B32 93 +# CHECK: KILL %t186 +# CHECK: %t188:sgpr_32 = S_NOT_B32 94 +# CHECK: KILL %t188 +# CHECK: %t190:sgpr_32 = S_NOT_B32 95 +# CHECK: KILL %t190 +# CHECK: %t192:sgpr_32 = S_NOT_B32 96 +# CHECK: KILL %t192 +# CHECK: %t194:sgpr_32 = S_NOT_B32 97 +# CHECK: KILL %t194 +# CHECK: %t196:sgpr_32 = S_NOT_B32 98 +# CHECK: KILL %t196 +# CHECK: %t198:sgpr_32 = S_NOT_B32 99 +# CHECK: KILL %t198 +# CHECK: %t200:sgpr_32 = S_NOT_B32 100 +# CHECK: KILL %t200 +# CHECK: %t202:sgpr_32 = S_NOT_B32 101 +# CHECK: KILL %t202 +# CHECK: %t204:sgpr_32 = S_NOT_B32 102 +# CHECK: KILL %t204 +# CHECK: %t206:sgpr_32 = S_NOT_B32 103 +# CHECK: KILL %t206 +# CHECK: %t208:sgpr_32 = S_NOT_B32 104 +# CHECK: KILL %t208 +# CHECK: %t210:sgpr_32 = S_NOT_B32 105 +# CHECK: KILL %t210 +# CHECK: %t212:sgpr_32 = S_NOT_B32 106 +# CHECK: KILL %t212 +# CHECK: %t214:sgpr_32 = S_NOT_B32 107 +# CHECK: KILL %t214 +# CHECK: %t216:sgpr_32 = S_NOT_B32 108 +# CHECK: KILL %t216 +# CHECK: %t218:sgpr_32 = S_NOT_B32 109 +# CHECK: KILL 
%t218 +# CHECK: %t220:sgpr_32 = S_NOT_B32 110 +# CHECK: KILL %t220 +# CHECK: %t222:sgpr_32 = S_NOT_B32 111 +# CHECK: KILL %t222 +# CHECK: %t224:sgpr_32 = S_NOT_B32 112 +# CHECK: KILL %t224 +# CHECK: %t226:sgpr_32 = S_NOT_B32 113 +# CHECK: KILL %t226 +# CHECK: %t228:sgpr_32 = S_NOT_B32 114 +# CHECK: KILL %t228 +# CHECK: %t230:sgpr_32 = S_NOT_B32 115 +# CHECK: KILL %t230 +# CHECK: %t232:sgpr_32 = S_NOT_B32 116 +# CHECK: KILL %t232 +# CHECK: %t234:sgpr_32 = S_NOT_B32 117 +# CHECK: KILL %t234 +# CHECK: %t236:sgpr_32 = S_NOT_B32 118 +# CHECK: KILL %t236 +# CHECK: %t238:sgpr_32 = S_NOT_B32 119 +# CHECK: KILL %t238 +# CHECK: %t240:sgpr_32 = S_NOT_B32 120 +# CHECK: KILL %t240 +# CHECK: %t242:sgpr_32 = S_NOT_B32 121 +# CHECK: KILL %t242 +# CHECK: %t244:sgpr_32 = S_NOT_B32 122 +# CHECK: KILL %t244 +# CHECK: %t246:sgpr_32 = S_NOT_B32 123 +# CHECK: KILL %t246 +# CHECK: %t248:sgpr_32 = S_NOT_B32 124 +# CHECK: KILL %t248 +# CHECK: %t250:sgpr_32 = S_NOT_B32 125 +# CHECK: KILL %t250 +# CHECK: %t252:sgpr_32 = S_NOT_B32 126 +# CHECK: KILL %t252 +# CHECK: %t254:sgpr_32 = S_NOT_B32 127 +# CHECK: KILL %t254 + + +--- | + define amdgpu_ps void @main() { + ret void + } +... +--- +name: main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + ; To inflate vgpr count + %v0:vreg_1024 = IMPLICIT_DEF + %v1:vreg_1024 = IMPLICIT_DEF + %v2:vreg_1024 = IMPLICIT_DEF + %v3:vreg_1024 = IMPLICIT_DEF + + ; Defs + %t0:sgpr_32 = S_NOT_B32 0, implicit-def $scc + %t2:sgpr_32 = S_NOT_B32 1, implicit-def $scc + %t4:sgpr_32 = S_NOT_B32 2, implicit-def $scc + %t6:sgpr_32 = S_NOT_B32 3, implicit-def $scc + %t8:sgpr_32 = S_NOT_B32 4, implicit-def $scc + %t10:sgpr_32 = S_NOT_B32 5, implicit-def $scc + %t12:sgpr_32 = S_NOT_B32 6, implicit-def $scc + %t14:sgpr_32 = S_NOT_B32 7, implicit-def $scc + %t16:sgpr_32 = S_NOT_B32 8, implicit-def $scc + %t18:sgpr_32 = S_NOT_B32 9, implicit-def $scc + %t20:sgpr_32 = S_NOT_B32 10, implicit-def $scc + %t22:sgpr_32 = S_NOT_B32 11, implicit-def $scc + %t24:sgpr_32 = S_NOT_B32 12, implicit-def $scc + %t26:sgpr_32 = S_NOT_B32 13, implicit-def $scc + %t28:sgpr_32 = S_NOT_B32 14, implicit-def $scc + %t30:sgpr_32 = S_NOT_B32 15, implicit-def $scc + %t32:sgpr_32 = S_NOT_B32 16, implicit-def $scc + %t34:sgpr_32 = S_NOT_B32 17, implicit-def $scc + %t36:sgpr_32 = S_NOT_B32 18, implicit-def $scc + %t38:sgpr_32 = S_NOT_B32 19, implicit-def $scc + %t40:sgpr_32 = S_NOT_B32 20, implicit-def $scc + %t42:sgpr_32 = S_NOT_B32 21, implicit-def $scc + %t44:sgpr_32 = S_NOT_B32 22, implicit-def $scc + %t46:sgpr_32 = S_NOT_B32 23, implicit-def $scc + %t48:sgpr_32 = S_NOT_B32 24, implicit-def $scc + %t50:sgpr_32 = S_NOT_B32 25, implicit-def $scc + %t52:sgpr_32 = S_NOT_B32 26, implicit-def $scc + %t54:sgpr_32 = S_NOT_B32 27, implicit-def $scc + %t56:sgpr_32 = S_NOT_B32 28, implicit-def $scc + %t58:sgpr_32 = S_NOT_B32 29, implicit-def $scc + %t60:sgpr_32 = S_NOT_B32 30, implicit-def $scc + %t62:sgpr_32 = S_NOT_B32 31, implicit-def $scc + %t64:sgpr_32 = S_NOT_B32 32, implicit-def $scc + %t66:sgpr_32 = S_NOT_B32 33, implicit-def $scc + %t68:sgpr_32 = S_NOT_B32 34, implicit-def $scc + %t70:sgpr_32 = S_NOT_B32 35, implicit-def $scc + %t72:sgpr_32 = S_NOT_B32 36, implicit-def $scc + %t74:sgpr_32 = S_NOT_B32 37, implicit-def $scc + %t76:sgpr_32 = S_NOT_B32 38, implicit-def $scc + %t78:sgpr_32 = S_NOT_B32 39, implicit-def $scc + %t80:sgpr_32 = S_NOT_B32 40, implicit-def $scc + %t82:sgpr_32 = S_NOT_B32 41, implicit-def $scc + %t84:sgpr_32 = S_NOT_B32 42, implicit-def $scc + %t86:sgpr_32 = S_NOT_B32 43, implicit-def 
$scc + %t88:sgpr_32 = S_NOT_B32 44, implicit-def $scc + %t90:sgpr_32 = S_NOT_B32 45, implicit-def $scc + %t92:sgpr_32 = S_NOT_B32 46, implicit-def $scc + %t94:sgpr_32 = S_NOT_B32 47, implicit-def $scc + %t96:sgpr_32 = S_NOT_B32 48, implicit-def $scc + %t98:sgpr_32 = S_NOT_B32 49, implicit-def $scc + %t100:sgpr_32 = S_NOT_B32 50, implicit-def $scc + %t102:sgpr_32 = S_NOT_B32 51, implicit-def $scc + %t104:sgpr_32 = S_NOT_B32 52, implicit-def $scc + %t106:sgpr_32 = S_NOT_B32 53, implicit-def $scc + %t108:sgpr_32 = S_NOT_B32 54, implicit-def $scc + %t110:sgpr_32 = S_NOT_B32 55, implicit-def $scc + %t112:sgpr_32 = S_NOT_B32 56, implicit-def $scc + %t114:sgpr_32 = S_NOT_B32 57, implicit-def $scc + %t116:sgpr_32 = S_NOT_B32 58, implicit-def $scc + %t118:sgpr_32 = S_NOT_B32 59, implicit-def $scc + %t120:sgpr_32 = S_NOT_B32 60, implicit-def $scc + %t122:sgpr_32 = S_NOT_B32 61, implicit-def $scc + %t124:sgpr_32 = S_NOT_B32 62, implicit-def $scc + %t126:sgpr_32 = S_NOT_B32 63, implicit-def $scc + %t128:sgpr_32 = S_NOT_B32 64, implicit-def $scc + %t130:sgpr_32 = S_NOT_B32 65, implicit-def $scc + %t132:sgpr_32 = S_NOT_B32 66, implicit-def $scc + %t134:sgpr_32 = S_NOT_B32 67, implicit-def $scc + %t136:sgpr_32 = S_NOT_B32 68, implicit-def $scc + %t138:sgpr_32 = S_NOT_B32 69, implicit-def $scc + %t140:sgpr_32 = S_NOT_B32 70, implicit-def $scc + %t142:sgpr_32 = S_NOT_B32 71, implicit-def $scc + %t144:sgpr_32 = S_NOT_B32 72, implicit-def $scc + %t146:sgpr_32 = S_NOT_B32 73, implicit-def $scc + %t148:sgpr_32 = S_NOT_B32 74, implicit-def $scc + %t150:sgpr_32 = S_NOT_B32 75, implicit-def $scc + %t152:sgpr_32 = S_NOT_B32 76, implicit-def $scc + %t154:sgpr_32 = S_NOT_B32 77, implicit-def $scc + %t156:sgpr_32 = S_NOT_B32 78, implicit-def $scc + %t158:sgpr_32 = S_NOT_B32 79, implicit-def $scc + %t160:sgpr_32 = S_NOT_B32 80, implicit-def $scc + %t162:sgpr_32 = S_NOT_B32 81, implicit-def $scc + %t164:sgpr_32 = S_NOT_B32 82, implicit-def $scc + %t166:sgpr_32 = S_NOT_B32 83, implicit-def $scc + %t168:sgpr_32 = S_NOT_B32 84, implicit-def $scc + %t170:sgpr_32 = S_NOT_B32 85, implicit-def $scc + %t172:sgpr_32 = S_NOT_B32 86, implicit-def $scc + %t174:sgpr_32 = S_NOT_B32 87, implicit-def $scc + %t176:sgpr_32 = S_NOT_B32 88, implicit-def $scc + %t178:sgpr_32 = S_NOT_B32 89, implicit-def $scc + %t180:sgpr_32 = S_NOT_B32 90, implicit-def $scc + %t182:sgpr_32 = S_NOT_B32 91, implicit-def $scc + %t184:sgpr_32 = S_NOT_B32 92, implicit-def $scc + %t186:sgpr_32 = S_NOT_B32 93, implicit-def $scc + %t188:sgpr_32 = S_NOT_B32 94, implicit-def $scc + %t190:sgpr_32 = S_NOT_B32 95, implicit-def $scc + %t192:sgpr_32 = S_NOT_B32 96, implicit-def $scc + %t194:sgpr_32 = S_NOT_B32 97, implicit-def $scc + %t196:sgpr_32 = S_NOT_B32 98, implicit-def $scc + %t198:sgpr_32 = S_NOT_B32 99, implicit-def $scc + %t200:sgpr_32 = S_NOT_B32 100, implicit-def $scc + %t202:sgpr_32 = S_NOT_B32 101, implicit-def $scc + %t204:sgpr_32 = S_NOT_B32 102, implicit-def $scc + %t206:sgpr_32 = S_NOT_B32 103, implicit-def $scc + %t208:sgpr_32 = S_NOT_B32 104, implicit-def $scc + %t210:sgpr_32 = S_NOT_B32 105, implicit-def $scc + %t212:sgpr_32 = S_NOT_B32 106, implicit-def $scc + %t214:sgpr_32 = S_NOT_B32 107, implicit-def $scc + %t216:sgpr_32 = S_NOT_B32 108, implicit-def $scc + %t218:sgpr_32 = S_NOT_B32 109, implicit-def $scc + %t220:sgpr_32 = S_NOT_B32 110, implicit-def $scc + %t222:sgpr_32 = S_NOT_B32 111, implicit-def $scc + %t224:sgpr_32 = S_NOT_B32 112, implicit-def $scc + %t226:sgpr_32 = S_NOT_B32 113, implicit-def $scc + %t228:sgpr_32 = S_NOT_B32 114, 
implicit-def $scc + %t230:sgpr_32 = S_NOT_B32 115, implicit-def $scc + %t232:sgpr_32 = S_NOT_B32 116, implicit-def $scc + %t234:sgpr_32 = S_NOT_B32 117, implicit-def $scc + %t236:sgpr_32 = S_NOT_B32 118, implicit-def $scc + %t238:sgpr_32 = S_NOT_B32 119, implicit-def $scc + %t240:sgpr_32 = S_NOT_B32 120, implicit-def $scc + %t242:sgpr_32 = S_NOT_B32 121, implicit-def $scc + %t244:sgpr_32 = S_NOT_B32 122, implicit-def $scc + %t246:sgpr_32 = S_NOT_B32 123, implicit-def $scc + %t248:sgpr_32 = S_NOT_B32 124, implicit-def $scc + %t250:sgpr_32 = S_NOT_B32 125, implicit-def $scc + %t252:sgpr_32 = S_NOT_B32 126, implicit-def $scc + %t254:sgpr_32 = S_NOT_B32 127, implicit-def $scc + + + ; Branch + %cmp:sreg_32_xm0 = V_CMP_GT_F32_e64 0, 0, 0, %v0.sub0, 0, implicit $exec, implicit $mode + $exec_lo = S_MOV_B32_term %cmp:sreg_32_xm0 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + S_BRANCH %bb.2 + + bb.2: + + ; Uses + KILL %t0 + KILL %t2 + KILL %t4 + KILL %t6 + KILL %t8 + KILL %t10 + KILL %t12 + KILL %t14 + KILL %t16 + KILL %t18 + KILL %t20 + KILL %t22 + KILL %t24 + KILL %t26 + KILL %t28 + KILL %t30 + KILL %t32 + KILL %t34 + KILL %t36 + KILL %t38 + KILL %t40 + KILL %t42 + KILL %t44 + KILL %t46 + KILL %t48 + KILL %t50 + KILL %t52 + KILL %t54 + KILL %t56 + KILL %t58 + KILL %t60 + KILL %t62 + KILL %t64 + KILL %t66 + KILL %t68 + KILL %t70 + KILL %t72 + KILL %t74 + KILL %t76 + KILL %t78 + KILL %t80 + KILL %t82 + KILL %t84 + KILL %t86 + KILL %t88 + KILL %t90 + KILL %t92 + KILL %t94 + KILL %t96 + KILL %t98 + KILL %t100 + KILL %t102 + KILL %t104 + KILL %t106 + KILL %t108 + KILL %t110 + KILL %t112 + KILL %t114 + KILL %t116 + KILL %t118 + KILL %t120 + KILL %t122 + KILL %t124 + KILL %t126 + KILL %t128 + KILL %t130 + KILL %t132 + KILL %t134 + KILL %t136 + KILL %t138 + KILL %t140 + KILL %t142 + KILL %t144 + KILL %t146 + KILL %t148 + KILL %t150 + KILL %t152 + KILL %t154 + KILL %t156 + KILL %t158 + KILL %t160 + KILL %t162 + KILL %t164 + KILL %t166 + KILL %t168 + KILL %t170 + KILL %t172 + KILL %t174 + KILL %t176 + KILL %t178 + KILL %t180 + KILL %t182 + KILL %t184 + KILL %t186 + KILL %t188 + KILL %t190 + KILL %t192 + KILL %t194 + KILL %t196 + KILL %t198 + KILL %t200 + KILL %t202 + KILL %t204 + KILL %t206 + KILL %t208 + KILL %t210 + KILL %t212 + KILL %t214 + KILL %t216 + KILL %t218 + KILL %t220 + KILL %t222 + KILL %t224 + KILL %t226 + KILL %t228 + KILL %t230 + KILL %t232 + KILL %t234 + KILL %t236 + KILL %t238 + KILL %t240 + KILL %t242 + KILL %t244 + KILL %t246 + KILL %t248 + KILL %t250 + KILL %t252 + KILL %t254 + + + + ; Some uses to inflate vgpr count + KILL %v0 + KILL %v1 + KILL %v2 + KILL %v3 + S_ENDPGM 0 +... + \ No newline at end of file